7 #ifndef TREELITE_TREE_H_ 8 #define TREELITE_TREE_H_ 17 #include <type_traits> 23 #define __TREELITE_STR(x) #x 24 #define _TREELITE_STR(x) __TREELITE_STR(x) 26 #define TREELITE_MAX_PRED_TRANSFORM_LENGTH 256 32 float stof(
const std::string& value,
size_t* pos);
56 inline void UseForeignBuffer(
void* prealloc_buf,
size_t size);
58 inline const T* Data()
const;
60 inline const T* End()
const;
62 inline const T& Back()
const;
63 inline size_t Size()
const;
64 inline void Reserve(
size_t newsize);
65 inline void Resize(
size_t newsize);
66 inline void Resize(
size_t newsize, T t);
68 inline void PushBack(T t);
69 inline void Extend(
const std::vector<T>& other);
71 inline T& operator[](
size_t idx);
72 inline const T& operator[](
size_t idx)
const;
74 inline T& at(
size_t idx);
75 inline const T& at(
size_t idx)
const;
77 inline T& at(
int idx);
78 inline const T& at(
int idx)
const;
79 static_assert(std::is_pod<T>::value,
"T must be POD");
121 kMultiClfGrovePerClass = 1,
137 kMultiClfProbDistLeaf = 2,
154 kMultiClfCategLeaf = 3
159 enum class OutputType : uint8_t { kFloat = 0, kInt = 1 };
187 static_assert(std::is_pod<TaskParameter>::value,
"TaskParameter must be POD type");
190 template <
typename ThresholdType,
typename LeafOutputType>
200 LeafOutputType leaf_value;
201 ThresholdType threshold;
244 bool categories_list_right_child_;
247 static_assert(std::is_pod<Node>::value,
"Node must be a POD type");
248 static_assert(std::is_same<ThresholdType, float>::value
249 || std::is_same<ThresholdType, double>::value,
250 "ThresholdType must be either float32 or float64");
251 static_assert(std::is_same<LeafOutputType, uint32_t>::value
252 || std::is_same<LeafOutputType, float>::value
253 || std::is_same<LeafOutputType, double>::value,
254 "LeafOutputType must be one of uint32_t, float32 or float64");
255 static_assert(std::is_same<ThresholdType, LeafOutputType>::value
256 || std::is_same<LeafOutputType, uint32_t>::value,
257 "Unsupported combination of ThresholdType and LeafOutputType");
258 static_assert((std::is_same<ThresholdType, float>::value &&
sizeof(
Node) == 48)
259 || (std::is_same<ThresholdType, double>::value &&
sizeof(
Node) == 56),
260 "Node size incorrect");
265 Tree& operator=(
const Tree&) =
delete;
267 Tree& operator=(
Tree&&) noexcept = default;
269 inline
Tree<ThresholdType, LeafOutputType> Clone() const;
271 inline const
char* GetFormatStringForNode();
273 inline
void InitFromPyBuffer(std::vector<
PyBufferFrame>::iterator begin,
285 inline
int AllocNode();
296 inline
void AddChilds(
int nid);
302 inline std::vector<
unsigned> GetCategoricalFeatures() const;
309 inline
int LeftChild(
int nid)
const {
310 return nodes_.at(nid).cleft_;
317 return nodes_.at(nid).cright_;
324 return DefaultLeft(nid) ? LeftChild(nid) : RightChild(nid);
331 return (nodes_.at(nid).sindex_ & ((1U << 31U) - 1U));
338 return (nodes_.at(nid).sindex_ >> 31U) != 0;
345 return nodes_.at(nid).cleft_ == -1;
352 return (nodes_.at(nid).info_).leaf_value;
358 inline std::vector<LeafOutputType>
LeafVector(
int nid)
const {
359 const size_t offset_begin = leaf_vector_offset_.at(nid);
360 const size_t offset_end = leaf_vector_offset_.at(nid + 1);
361 if (offset_begin >= leaf_vector_.Size() || offset_end > leaf_vector_.Size()) {
363 return std::vector<LeafOutputType>();
365 return std::vector<LeafOutputType>(&leaf_vector_[offset_begin],
366 &leaf_vector_[offset_end]);
375 return leaf_vector_offset_.at(nid) != leaf_vector_offset_.at(nid + 1);
382 return (nodes_.at(nid).info_).threshold;
389 return nodes_.at(nid).cmp_;
400 const size_t offset_begin = matching_categories_offset_.at(nid);
401 const size_t offset_end = matching_categories_offset_.at(nid + 1);
402 if (offset_begin >= matching_categories_.Size() || offset_end > matching_categories_.Size()) {
405 return std::vector<uint32_t>();
407 return std::vector<uint32_t>(&matching_categories_[offset_begin],
408 &matching_categories_[offset_end]);
418 return matching_categories_offset_.at(nid) != matching_categories_offset_.at(nid + 1);
425 return nodes_.at(nid).split_type_;
432 return nodes_.at(nid).data_count_present_;
439 return nodes_.at(nid).data_count_;
447 return nodes_.at(nid).sum_hess_present_;
454 return nodes_.at(nid).sum_hess_;
461 return nodes_.at(nid).gain_present_;
467 inline double Gain(
int nid)
const {
468 return nodes_.at(nid).gain_;
476 return nodes_.at(nid).categories_list_right_child_;
489 inline void SetNumericalSplit(
int nid,
unsigned split_index, ThresholdType threshold,
503 inline void SetCategoricalSplit(
int nid,
unsigned split_index,
bool default_left,
504 const std::vector<uint32_t>& categories_list,
505 bool categories_list_right_child);
511 inline void SetLeaf(
int nid, LeafOutputType value);
517 inline void SetLeafVector(
int nid,
const std::vector<LeafOutputType>& leaf_vector);
524 Node& node = nodes_.at(nid);
525 node.sum_hess_ = sum_hess;
526 node.sum_hess_present_ =
true;
534 Node& node = nodes_.at(nid);
535 node.data_count_ = data_count;
536 node.data_count_present_ =
true;
544 Node& node = nodes_.at(nid);
546 node.gain_present_ =
true;
549 void ReferenceSerialize(dmlc::Stream* fo)
const;
574 char pred_transform[TREELITE_MAX_PRED_TRANSFORM_LENGTH] = {0};
592 ModelParam() : sigmoid_alpha(1.0f), global_bias(0.0f) {
593 std::memset(pred_transform, 0, TREELITE_MAX_PRED_TRANSFORM_LENGTH *
sizeof(
char));
594 std::strncpy(pred_transform,
"identity",
sizeof(pred_transform));
602 template<
typename Container>
603 inline std::vector<std::pair<std::string, std::string>>
604 InitAllowUnknown(
const Container &kwargs);
605 inline std::map<std::string, std::string> __DICT__()
const;
608 static_assert(std::is_standard_layout<ModelParam>::value,
609 "ModelParam must be in the standard layout");
611 inline void InitParamAndCheck(
ModelParam* param,
612 const std::vector<std::pair<std::string, std::string>>& cfg);
619 virtual ~
Model() =
default;
625 template <
typename ThresholdType,
typename LeafOutputType>
626 inline static std::unique_ptr<Model> Create();
627 inline static std::unique_ptr<Model> Create(
TypeInfo threshold_type,
TypeInfo leaf_output_type);
628 inline TypeInfo GetThresholdType()
const {
629 return threshold_type_;
631 inline TypeInfo GetLeafOutputType()
const {
632 return leaf_output_type_;
634 template <
typename Func>
635 inline auto Dispatch(Func func);
636 template <
typename Func>
637 inline auto Dispatch(Func func)
const;
639 virtual size_t GetNumTree()
const = 0;
640 virtual void SetTreeLimit(
size_t limit) = 0;
641 virtual void ReferenceSerialize(dmlc::Stream* fo)
const = 0;
643 inline std::vector<PyBufferFrame> GetPyBuffer();
644 inline static std::unique_ptr<Model> CreateFromPyBuffer(std::vector<PyBufferFrame> frames);
664 virtual void GetPyBuffer(std::vector<PyBufferFrame>* dest) = 0;
665 virtual void InitFromPyBuffer(std::vector<PyBufferFrame>::iterator begin,
666 std::vector<PyBufferFrame>::iterator end) = 0;
669 template <
typename ThresholdType,
typename LeafOutputType>
673 std::vector<Tree<ThresholdType, LeafOutputType>>
trees;
683 void ReferenceSerialize(
dmlc::Stream* fo) const override;
684 inline
size_t GetNumTree()
const override {
687 void SetTreeLimit(
size_t limit)
override {
688 return trees.resize(limit);
691 inline void GetPyBuffer(std::vector<PyBufferFrame>* dest)
override;
692 inline void InitFromPyBuffer(std::vector<PyBufferFrame>::iterator begin,
693 std::vector<PyBufferFrame>::iterator end)
override;
700 #endif // TREELITE_TREE_H_ ModelParam param
extra parameters
SplitFeatureType split_type_
feature split type
Operator ComparisonOp(int nid) const
get comparison operator
Implementation for tree.h.
bool gain_present_
whether gain_ field is present
SplitFeatureType
feature split type
uint64_t data_count_
number of data points whose traversal paths include this node. LightGBM models natively store this statistic.
bool HasDataCount(int nid) const
test whether this node has data count
bool HasGain(int nid) const
test whether this node has gain value
Operator cmp_
operator to use for expression of form [fval] OP [threshold]. If the expression evaluates to true...
std::vector< LeafOutputType > LeafVector(int nid) const
get leaf vector of the leaf node; useful for multi-class random forest classifier ...
TaskType
Enum type representing the task type.
bool average_tree_output
whether to average tree outputs
float sigmoid_alpha
scaling parameter for sigmoid function sigmoid(x) = 1 / (1 + exp(-alpha * x))
int DefaultChild(int nid) const
index of the node's "default" child, used when feature is missing
bool data_count_present_
whether data_count_ field is present
int32_t cleft_
pointer to left and right children
void SetSumHess(int nid, double sum_hess)
set the hessian sum of the node
in-memory representation of a decision tree
double sum_hess_
sum of hessian values for all data points whose traversal paths include this node. This value is generally correlated positively with the data count. XGBoost models natively store this statistic.
float global_bias
global bias of the model
double gain_
change in loss that is attributed to a particular split
TaskType task_type
Task type.
uint32_t SplitIndex(int nid) const
feature index of the node's split condition
TypeInfo
Types used by thresholds and leaf outputs.
Group of parameters that are dependent on the choice of the task type.
store either leaf value or decision threshold
std::vector< Tree< ThresholdType, LeafOutputType > > trees
member trees
double SumHess(int nid) const
get hessian sum
void SetGain(int nid, double gain)
set the gain value of the node
TaskParameter task_param
Group of parameters that are specific to the particular task type.
void SetDataCount(int nid, uint64_t data_count)
set the data count of the node
bool CategoriesListRightChild(int nid) const
test whether the list given by MatchingCategories(nid) is associated with the right child node or the...
SplitFeatureType SplitType(int nid) const
get feature split type
LeafOutputType LeafValue(int nid) const
get leaf value of the leaf node
defines configuration macros of Treelite
std::vector< uint32_t > MatchingCategories(int nid) const
Get list of all categories belonging to the left/right child node. See the categories_list_right_chil...
unsigned int num_class
The number of classes in the target label.
uint64_t DataCount(int nid) const
get data count
double Gain(int nid) const
get gain value
int RightChild(int nid) const
index of the node's right child
bool grove_per_class
Whether we designate a subset of the trees to compute the prediction for each class.
bool HasMatchingCategories(int nid) const
tests whether the node has a non-empty list for matching categories. See MatchingCategories() for the...
bool sum_hess_present_
whether sum_hess_ field is present
thin wrapper for tree ensemble model
bool DefaultLeft(int nid) const
whether to use the left child node, when the feature in the split condition is missing ...
bool HasSumHess(int nid) const
test whether this node has hessian sum
bool IsLeaf(int nid) const
whether the node is a leaf node
OutputType output_type
The type of output from each leaf node.
int num_feature
number of features used for the model. It is assumed that all feature indices are between 0 and [num_...
uint32_t sindex_
feature index used for the split; the highest bit indicates the default direction for missing values ...
unsigned int leaf_vector_size
Dimension of the output from each leaf node.
ThresholdType Threshold(int nid) const
get threshold of the node
bool HasLeafVector(int nid) const
tests whether the leaf node has a non-empty leaf vector
Info info_
storage for leaf value or decision threshold
Operator
comparison operators