Treelite
sklearn.cc
Go to the documentation of this file.
1 
7 #include <treelite/frontend.h>
8 #include <treelite/tree.h>
9 #include <memory>
10 #include <queue>
11 #include <algorithm>
12 #include <numeric>
13 #include <tuple>
14 
15 namespace treelite {
16 namespace frontend {
17 
18 template <typename MetaHandlerFunc, typename LeafHandlerFunc>
19 std::unique_ptr<treelite::Model> LoadSKLearnModel(
20  int n_trees, int n_features, int n_classes, const int64_t* node_count,
21  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
22  const double** threshold, const double** value, const int64_t** n_node_samples,
23  const double** impurity, MetaHandlerFunc meta_handler, LeafHandlerFunc leaf_handler) {
24  CHECK_GT(n_trees, 0);
25  CHECK_GT(n_features, 0);
26 
27  std::unique_ptr<treelite::Model> model_ptr = treelite::Model::Create<double, double>();
28  meta_handler(model_ptr.get(), n_features, n_classes);
29  auto* model = dynamic_cast<treelite::ModelImpl<double, double>*>(model_ptr.get());
30 
31  for (int tree_id = 0; tree_id < n_trees; ++tree_id) {
32  model->trees.emplace_back();
33  treelite::Tree<double, double>& tree = model->trees.back();
34  tree.Init();
35 
36  // Assign node ID's so that a breadth-wise traversal would yield
37  // the monotonic sequence 0, 1, 2, ...
38  std::queue<std::pair<int64_t, int>> Q; // (old ID, new ID) pair
39  Q.push({0, 0});
40  const int64_t total_sample_cnt = n_node_samples[tree_id][0];
41  while (!Q.empty()) {
42  int64_t node_id;
43  int new_node_id;
44  std::tie(node_id, new_node_id) = Q.front(); Q.pop();
45  const int64_t left_child_id = children_left[tree_id][node_id];
46  const int64_t right_child_id = children_right[tree_id][node_id];
47  const int64_t sample_cnt = n_node_samples[tree_id][node_id];
48  if (left_child_id == -1) { // leaf node
49  leaf_handler(tree_id, node_id, new_node_id, value, n_classes, tree);
50  } else {
51  const int64_t split_index = feature[tree_id][node_id];
52  const double split_cond = threshold[tree_id][node_id];
53  const int64_t left_child_sample_cnt = n_node_samples[tree_id][left_child_id];
54  const int64_t right_child_sample_cnt = n_node_samples[tree_id][right_child_id];
55  const double gain = sample_cnt * (
56  impurity[tree_id][node_id]
57  - left_child_sample_cnt * impurity[tree_id][left_child_id] / sample_cnt
58  - right_child_sample_cnt * impurity[tree_id][right_child_id] / sample_cnt)
59  / total_sample_cnt;
60 
61  tree.AddChilds(new_node_id);
62  tree.SetNumericalSplit(new_node_id, split_index, split_cond, true, treelite::Operator::kLE);
63  tree.SetGain(new_node_id, gain);
64  Q.push({left_child_id, tree.LeftChild(new_node_id)});
65  Q.push({right_child_id, tree.RightChild(new_node_id)});
66  }
67  tree.SetDataCount(new_node_id, sample_cnt);
68  }
69  }
70  return model_ptr;
71 }
72 
73 std::unique_ptr<treelite::Model> LoadSKLearnRandomForestRegressor(
74  int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
75  const int64_t** children_right, const int64_t** feature, const double** threshold,
76  const double** value, const int64_t** n_node_samples, const double** impurity) {
77  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
78  model->num_feature = n_features;
79  model->average_tree_output = true;
80  model->task_type = treelite::TaskType::kBinaryClfRegr;
81  model->task_param.grove_per_class = false;
82  model->task_param.output_type = treelite::TaskParameter::OutputType::kFloat;
83  model->task_param.num_class = 1;
84  model->task_param.leaf_vector_size = 1;
85  std::strncpy(model->param.pred_transform, "identity", sizeof(model->param.pred_transform));
86  model->param.global_bias = 0.0f;
87  };
88  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
89  int n_classes, treelite::Tree<double, double>& dest_tree) {
90  const double leaf_value = value[tree_id][node_id];
91  dest_tree.SetLeaf(new_node_id, leaf_value);
92  };
93  return LoadSKLearnModel(n_estimators, n_features, 1, node_count, children_left, children_right,
94  feature, threshold, value, n_node_samples, impurity, meta_handler, leaf_handler);
95 }
96 
97 std::unique_ptr<treelite::Model> LoadSKLearnRandomForestClassifierBinary(
98  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
99  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
100  const double** threshold, const double** value, const int64_t** n_node_samples,
101  const double** impurity) {
102  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
103  model->num_feature = n_features;
104  model->average_tree_output = true;
105  model->task_type = treelite::TaskType::kBinaryClfRegr;
106  model->task_param.grove_per_class = false;
107  model->task_param.output_type = treelite::TaskParameter::OutputType::kFloat;
108  model->task_param.num_class = 1;
109  model->task_param.leaf_vector_size = 1;
110  std::strncpy(model->param.pred_transform, "identity", sizeof(model->param.pred_transform));
111  model->param.global_bias = 0.0f;
112  };
113  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
114  int n_classes, treelite::Tree<double, double>& dest_tree) {
115  // Get counts for each label (+/-) at this leaf node
116  const double* leaf_count = &value[tree_id][node_id * 2];
117  // Compute the fraction of positive data points at this leaf node
118  const double fraction_positive = leaf_count[1] / (leaf_count[0] + leaf_count[1]);
119  dest_tree.SetLeaf(new_node_id, fraction_positive);
120  };
121  return LoadSKLearnModel(n_estimators, n_features, n_classes, node_count, children_left,
122  children_right, feature, threshold, value, n_node_samples, impurity, meta_handler,
123  leaf_handler);
124 }
125 
126 std::unique_ptr<treelite::Model> LoadSKLearnRandomForestClassifierMulticlass(
127  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
128  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
129  const double** threshold, const double** value, const int64_t** n_node_samples,
130  const double** impurity) {
131  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
132  model->num_feature = n_features;
133  model->average_tree_output = true;
134  model->task_type = treelite::TaskType::kMultiClfProbDistLeaf;
135  model->task_param.grove_per_class = false;
136  model->task_param.output_type = treelite::TaskParameter::OutputType::kFloat;
137  model->task_param.num_class = n_classes;
138  model->task_param.leaf_vector_size = n_classes;
139  std::strncpy(model->param.pred_transform, "identity_multiclass",
140  sizeof(model->param.pred_transform));
141  model->param.global_bias = 0.0f;
142  };
143  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
144  int n_classes, treelite::Tree<double, double>& dest_tree) {
145  // Get counts for each label class at this leaf node
146  std::vector<double> prob_distribution(&value[tree_id][node_id * n_classes],
147  &value[tree_id][(node_id + 1) * n_classes]);
148  // Compute the probability distribution over label classes
149  const double norm_factor =
150  std::accumulate(prob_distribution.begin(), prob_distribution.end(), 0.0);
151  std::for_each(prob_distribution.begin(), prob_distribution.end(), [norm_factor](double& e) {
152  e /= norm_factor;
153  });
154  dest_tree.SetLeafVector(new_node_id, prob_distribution);
155  };
156  return LoadSKLearnModel(n_estimators, n_features, n_classes, node_count, children_left,
157  children_right, feature, threshold, value, n_node_samples, impurity, meta_handler,
158  leaf_handler);
159 }
160 
161 std::unique_ptr<treelite::Model> LoadSKLearnRandomForestClassifier(
162  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
163  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
164  const double** threshold, const double** value, const int64_t** n_node_samples,
165  const double** impurity) {
166  CHECK_GE(n_classes, 2);
167  if (n_classes == 2) {
168  return LoadSKLearnRandomForestClassifierBinary(n_estimators, n_features, n_classes, node_count,
169  children_left, children_right, feature, threshold, value, n_node_samples, impurity);
170  } else {
171  return LoadSKLearnRandomForestClassifierMulticlass(n_estimators, n_features, n_classes,
172  node_count, children_left, children_right, feature, threshold, value, n_node_samples,
173  impurity);
174  }
175 }
176 
177 std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingRegressor(
178  int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
179  const int64_t** children_right, const int64_t** feature, const double** threshold,
180  const double** value, const int64_t** n_node_samples, const double** impurity) {
181  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
182  model->num_feature = n_features;
183  model->average_tree_output = false;
184  model->task_type = treelite::TaskType::kBinaryClfRegr;
185  model->task_param.grove_per_class = false;
186  model->task_param.output_type = treelite::TaskParameter::OutputType::kFloat;
187  model->task_param.num_class = 1;
188  model->task_param.leaf_vector_size = 1;
189  std::strncpy(model->param.pred_transform, "identity", sizeof(model->param.pred_transform));
190  model->param.global_bias = 0.0f;
191  };
192  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
193  int n_classes, treelite::Tree<double, double>& dest_tree) {
194  const double leaf_value = value[tree_id][node_id];
195  dest_tree.SetLeaf(new_node_id, leaf_value);
196  };
197  return LoadSKLearnModel(n_estimators, n_features, 1, node_count, children_left,
198  children_right, feature, threshold, value, n_node_samples, impurity, meta_handler,
199  leaf_handler);
200 }
201 
202 std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingClassifierBinary(
203  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
204  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
205  const double** threshold, const double** value, const int64_t** n_node_samples,
206  const double** impurity) {
207  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
208  model->num_feature = n_features;
209  model->average_tree_output = false;
210  model->task_type = treelite::TaskType::kBinaryClfRegr;
211  model->task_param.grove_per_class = false;
212  model->task_param.output_type = treelite::TaskParameter::OutputType::kFloat;
213  model->task_param.num_class = 1;
214  model->task_param.leaf_vector_size = 1;
215  std::strncpy(model->param.pred_transform, "sigmoid", sizeof(model->param.pred_transform));
216  model->param.global_bias = 0.0f;
217  };
218  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
219  int n_classes, treelite::Tree<double, double>& dest_tree) {
220  const double leaf_value = value[tree_id][node_id];
221  dest_tree.SetLeaf(new_node_id, leaf_value);
222  };
223  return LoadSKLearnModel(n_estimators, n_features, n_classes, node_count, children_left,
224  children_right, feature, threshold, value, n_node_samples, impurity, meta_handler,
225  leaf_handler);
226 }
227 
228 std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingClassifierMulticlass(
229  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
230  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
231  const double** threshold, const double** value, const int64_t** n_node_samples,
232  const double** impurity) {
233  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
234  model->num_feature = n_features;
235  model->average_tree_output = false;
236  model->task_type = treelite::TaskType::kMultiClfGrovePerClass;
237  model->task_param.grove_per_class = true;
238  model->task_param.output_type = treelite::TaskParameter::OutputType::kFloat;
239  model->task_param.num_class = n_classes;
240  model->task_param.leaf_vector_size = 1;
241  std::strncpy(model->param.pred_transform, "softmax", sizeof(model->param.pred_transform));
242  model->param.global_bias = 0.0f;
243  };
244  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
245  int n_classes, treelite::Tree<double, double>& dest_tree) {
246  const double leaf_value = value[tree_id][node_id];
247  dest_tree.SetLeaf(new_node_id, leaf_value);
248  };
249  return LoadSKLearnModel(n_estimators * n_classes, n_features, n_classes, node_count,
250  children_left, children_right, feature, threshold, value, n_node_samples, impurity,
251  meta_handler, leaf_handler);
252 }
253 
254 std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingClassifier(
255  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
256  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
257  const double** threshold, const double** value, const int64_t** n_node_samples,
258  const double** impurity) {
259  CHECK_GE(n_classes, 2);
260  if (n_classes == 2) {
261  return LoadSKLearnGradientBoostingClassifierBinary(n_estimators, n_features, n_classes,
262  node_count, children_left, children_right, feature, threshold, value, n_node_samples,
263  impurity);
264  } else {
265  return LoadSKLearnGradientBoostingClassifierMulticlass(n_estimators, n_features, n_classes,
266  node_count, children_left, children_right, feature, threshold, value, n_node_samples,
267  impurity);
268  }
269 }
270 
271 } // namespace frontend
272 } // namespace treelite
ModelParam param
extra parameters
Definition: tree.h:681
std::unique_ptr< treelite::Model > LoadSKLearnRandomForestRegressor(int n_estimators, int n_features, const int64_t *node_count, const int64_t **children_left, const int64_t **children_right, const int64_t **feature, const double **threshold, const double **value, const int64_t **n_node_samples, const double **impurity)
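Load a scikit-learn random forest regressor model from a collection of arrays. Refer to https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to learn the meaning of the arrays in detail.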
Definition: sklearn.cc:73
Collection of front-end methods to load or construct ensemble model.
void Init()
initialize the model with a single root node
Definition: tree_impl.h:627
bool average_tree_output
whether to average tree outputs
Definition: tree.h:677
model structure for tree ensemble
in-memory representation of a decision tree
Definition: tree.h:197
float global_bias
global bias of the model
Definition: tree.h:606
TaskType task_type
Task type.
Definition: tree.h:675
std::unique_ptr< treelite::Model > LoadSKLearnRandomForestClassifier(int n_estimators, int n_features, int n_classes, const int64_t *node_count, const int64_t **children_left, const int64_t **children_right, const int64_t **feature, const double **threshold, const double **value, const int64_t **n_node_samples, const double **impurity)
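Load a scikit-learn random forest classifier model from a collection of arrays. Refer to https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to learn the meaning of the arrays in detail.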
Definition: sklearn.cc:161
std::unique_ptr< treelite::Model > LoadSKLearnGradientBoostingClassifier(int n_estimators, int n_features, int n_classes, const int64_t *node_count, const int64_t **children_left, const int64_t **children_right, const int64_t **feature, const double **threshold, const double **value, const int64_t **n_node_samples, const double **impurity)
Load a scikit-learn gradient boosting classifier model from a collection of arrays. Refer to https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to learn the meaning of the arrays in detail.
Definition: sklearn.cc:254
std::vector< Tree< ThresholdType, LeafOutputType > > trees
member trees
Definition: tree.h:705
std::unique_ptr< treelite::Model > LoadSKLearnGradientBoostingRegressor(int n_estimators, int n_features, const int64_t *node_count, const int64_t **children_left, const int64_t **children_right, const int64_t **feature, const double **threshold, const double **value, const int64_t **n_node_samples, const double **impurity)
Load a scikit-learn gradient boosting regressor model from a collection of arrays. Refer to https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to learn the meaning of the arrays in detail.
Definition: sklearn.cc:177
void SetGain(int nid, double gain)
set the gain value of the node
Definition: tree.h:560
TaskParameter task_param
Group of parameters that are specific to the particular task type.
Definition: tree.h:679
void SetDataCount(int nid, uint64_t data_count)
set the data count of the node
Definition: tree.h:550
int LeftChild(int nid) const
Getters.
Definition: tree.h:326
unsigned int num_class
The number of classes in the target label.
Definition: tree.h:183
int RightChild(int nid) const
index of the node's right child
Definition: tree.h:333
bool grove_per_class
Whether we designate a subset of the trees to compute the prediction for each class.
Definition: tree.h:175
void AddChilds(int nid)
add child nodes to node
Definition: tree_impl.h:640
thin wrapper for tree ensemble model
Definition: tree.h:632
OutputType output_type
The type of output from each leaf node.
Definition: tree.h:167
int num_feature
number of features used for the model. It is assumed that all feature indices are between 0 and [num_feature]-1.
Definition: tree.h:673
unsigned int leaf_vector_size
Dimension of the output from each leaf node.
Definition: tree.h:190
void SetNumericalSplit(int nid, unsigned split_index, ThresholdType threshold, bool default_left, Operator cmp)
Setters.
Definition: tree_impl.h:678
char pred_transform[TREELITE_MAX_PRED_TRANSFORM_LENGTH]
name of prediction transform function
Definition: tree.h:591