Treelite
sklearn.cc
Go to the documentation of this file.
1 
7 #include <treelite/logging.h>
8 #include <treelite/frontend.h>
9 #include <treelite/tree.h>
10 #include <memory>
11 #include <queue>
12 #include <algorithm>
13 #include <numeric>
14 #include <tuple>
15 
16 namespace treelite {
17 namespace frontend {
18 
19 template <typename MetaHandlerFunc, typename LeafHandlerFunc>
20 std::unique_ptr<treelite::Model> LoadSKLearnModel(
21  int n_trees, int n_features, int n_classes, const int64_t* node_count,
22  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
23  const double** threshold, const double** value, const int64_t** n_node_samples,
24  const double** weighted_n_node_samples, const double** impurity, MetaHandlerFunc meta_handler,
25  LeafHandlerFunc leaf_handler) {
26  TREELITE_CHECK_GT(n_trees, 0);
27  TREELITE_CHECK_GT(n_features, 0);
28 
29  std::unique_ptr<treelite::Model> model_ptr = treelite::Model::Create<double, double>();
30  meta_handler(model_ptr.get(), n_features, n_classes);
31  auto* model = dynamic_cast<treelite::ModelImpl<double, double>*>(model_ptr.get());
32 
33  for (int tree_id = 0; tree_id < n_trees; ++tree_id) {
34  model->trees.emplace_back();
35  treelite::Tree<double, double>& tree = model->trees.back();
36  tree.Init();
37 
38  // Assign node ID's so that a breadth-wise traversal would yield
39  // the monotonic sequence 0, 1, 2, ...
40  std::queue<std::pair<int64_t, int>> Q; // (old ID, new ID) pair
41  Q.push({0, 0});
42  const int64_t total_sample_cnt = n_node_samples[tree_id][0];
43  while (!Q.empty()) {
44  int64_t node_id;
45  int new_node_id;
46  std::tie(node_id, new_node_id) = Q.front(); Q.pop();
47  const int64_t left_child_id = children_left[tree_id][node_id];
48  const int64_t right_child_id = children_right[tree_id][node_id];
49  const int64_t sample_cnt = n_node_samples[tree_id][node_id];
50  const double weighted_sample_cnt = weighted_n_node_samples[tree_id][node_id];
51  if (left_child_id == -1) { // leaf node
52  leaf_handler(tree_id, node_id, new_node_id, value, n_classes, tree);
53  } else {
54  const int64_t split_index = feature[tree_id][node_id];
55  const double split_cond = threshold[tree_id][node_id];
56  const int64_t left_child_sample_cnt = n_node_samples[tree_id][left_child_id];
57  const int64_t right_child_sample_cnt = n_node_samples[tree_id][right_child_id];
58  const double gain = sample_cnt * (
59  impurity[tree_id][node_id]
60  - left_child_sample_cnt * impurity[tree_id][left_child_id] / sample_cnt
61  - right_child_sample_cnt * impurity[tree_id][right_child_id] / sample_cnt)
62  / total_sample_cnt;
63 
64  tree.AddChilds(new_node_id);
65  tree.SetNumericalSplit(new_node_id, split_index, split_cond, true, treelite::Operator::kLE);
66  tree.SetGain(new_node_id, gain);
67  Q.push({left_child_id, tree.LeftChild(new_node_id)});
68  Q.push({right_child_id, tree.RightChild(new_node_id)});
69  }
70  tree.SetDataCount(new_node_id, sample_cnt);
71  tree.SetSumHess(new_node_id, weighted_sample_cnt);
72  }
73  }
74  return model_ptr;
75 }
76 
77 std::unique_ptr<treelite::Model> LoadSKLearnRandomForestRegressor(
78  int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
79  const int64_t** children_right, const int64_t** feature, const double** threshold,
80  const double** value, const int64_t** n_node_samples, const double** weighted_n_node_samples,
81  const double** impurity) {
82  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
83  model->num_feature = n_features;
84  model->average_tree_output = true;
85  model->task_type = treelite::TaskType::kBinaryClfRegr;
86  model->task_param.grove_per_class = false;
87  model->task_param.output_type = treelite::TaskParam::OutputType::kFloat;
88  model->task_param.num_class = 1;
89  model->task_param.leaf_vector_size = 1;
90  std::strncpy(model->param.pred_transform, "identity", sizeof(model->param.pred_transform));
91  model->param.global_bias = 0.0f;
92  };
93  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
94  int n_classes, treelite::Tree<double, double>& dest_tree) {
95  const double leaf_value = value[tree_id][node_id];
96  dest_tree.SetLeaf(new_node_id, leaf_value);
97  };
98  return LoadSKLearnModel(n_estimators, n_features, 1, node_count, children_left, children_right,
99  feature, threshold, value, n_node_samples, weighted_n_node_samples, impurity, meta_handler,
100  leaf_handler);
101 }
102 
103 std::unique_ptr<treelite::Model> LoadSKLearnIsolationForest(
104  int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
105  const int64_t** children_right, const int64_t** feature, const double** threshold,
106  const double** value, const int64_t** n_node_samples, const double** weighted_n_node_samples,
107  const double** impurity, const double ratio_c) {
108  auto meta_handler = [ratio_c](treelite::Model* model, int n_features, int n_classes) {
109  model->num_feature = n_features;
110  model->average_tree_output = true;
111  model->task_type = treelite::TaskType::kBinaryClfRegr;
112  model->task_param.grove_per_class = false;
113  model->task_param.output_type = treelite::TaskParam::OutputType::kFloat;
114  model->task_param.num_class = 1;
115  model->task_param.leaf_vector_size = 1;
116  std::strncpy(
117  model->param.pred_transform, "exponential_standard_ratio",
118  sizeof(model->param.pred_transform));
119  model->param.ratio_c = ratio_c;
120  };
121  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
122  int n_classes, treelite::Tree<double, double>& dest_tree) {
123  const double leaf_value = value[tree_id][node_id];
124  dest_tree.SetLeaf(new_node_id, leaf_value);
125  };
126  return LoadSKLearnModel(n_estimators, n_features, 1, node_count, children_left, children_right,
127  feature, threshold, value, n_node_samples, weighted_n_node_samples, impurity, meta_handler,
128  leaf_handler);
129 }
130 
131 std::unique_ptr<treelite::Model> LoadSKLearnRandomForestClassifierBinary(
132  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
133  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
134  const double** threshold, const double** value, const int64_t** n_node_samples,
135  const double** weighted_n_node_samples, const double** impurity) {
136  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
137  model->num_feature = n_features;
138  model->average_tree_output = true;
139  model->task_type = treelite::TaskType::kBinaryClfRegr;
140  model->task_param.grove_per_class = false;
141  model->task_param.output_type = treelite::TaskParam::OutputType::kFloat;
142  model->task_param.num_class = 1;
143  model->task_param.leaf_vector_size = 1;
144  std::strncpy(model->param.pred_transform, "identity", sizeof(model->param.pred_transform));
145  model->param.global_bias = 0.0f;
146  };
147  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
148  int n_classes, treelite::Tree<double, double>& dest_tree) {
149  // Get counts for each label (+/-) at this leaf node
150  const double* leaf_count = &value[tree_id][node_id * 2];
151  // Compute the fraction of positive data points at this leaf node
152  const double fraction_positive = leaf_count[1] / (leaf_count[0] + leaf_count[1]);
153  dest_tree.SetLeaf(new_node_id, fraction_positive);
154  };
155  return LoadSKLearnModel(n_estimators, n_features, n_classes, node_count, children_left,
156  children_right, feature, threshold, value, n_node_samples, weighted_n_node_samples, impurity,
157  meta_handler, leaf_handler);
158 }
159 
160 std::unique_ptr<treelite::Model> LoadSKLearnRandomForestClassifierMulticlass(
161  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
162  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
163  const double** threshold, const double** value, const int64_t** n_node_samples,
164  const double** weighted_n_node_samples, const double** impurity) {
165  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
166  model->num_feature = n_features;
167  model->average_tree_output = true;
168  model->task_type = treelite::TaskType::kMultiClfProbDistLeaf;
169  model->task_param.grove_per_class = false;
170  model->task_param.output_type = treelite::TaskParam::OutputType::kFloat;
171  model->task_param.num_class = n_classes;
172  model->task_param.leaf_vector_size = n_classes;
173  std::strncpy(model->param.pred_transform, "identity_multiclass",
174  sizeof(model->param.pred_transform));
175  model->param.global_bias = 0.0f;
176  };
177  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
178  int n_classes, treelite::Tree<double, double>& dest_tree) {
179  // Get counts for each label class at this leaf node
180  std::vector<double> prob_distribution(&value[tree_id][node_id * n_classes],
181  &value[tree_id][(node_id + 1) * n_classes]);
182  // Compute the probability distribution over label classes
183  const double norm_factor =
184  std::accumulate(prob_distribution.begin(), prob_distribution.end(), 0.0);
185  std::for_each(prob_distribution.begin(), prob_distribution.end(), [norm_factor](double& e) {
186  e /= norm_factor;
187  });
188  dest_tree.SetLeafVector(new_node_id, prob_distribution);
189  };
190  return LoadSKLearnModel(n_estimators, n_features, n_classes, node_count, children_left,
191  children_right, feature, threshold, value, n_node_samples, weighted_n_node_samples, impurity,
192  meta_handler, leaf_handler);
193 }
194 
195 std::unique_ptr<treelite::Model> LoadSKLearnRandomForestClassifier(
196  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
197  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
198  const double** threshold, const double** value, const int64_t** n_node_samples,
199  const double** weighted_n_node_samples, const double** impurity) {
200  TREELITE_CHECK_GE(n_classes, 2);
201  if (n_classes == 2) {
202  return LoadSKLearnRandomForestClassifierBinary(n_estimators, n_features, n_classes, node_count,
203  children_left, children_right, feature, threshold, value, n_node_samples,
204  weighted_n_node_samples, impurity);
205  } else {
206  return LoadSKLearnRandomForestClassifierMulticlass(n_estimators, n_features, n_classes,
207  node_count, children_left, children_right, feature, threshold, value, n_node_samples,
208  weighted_n_node_samples, impurity);
209  }
210 }
211 
212 std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingRegressor(
213  int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
214  const int64_t** children_right, const int64_t** feature, const double** threshold,
215  const double** value, const int64_t** n_node_samples, const double** weighted_n_node_samples,
216  const double** impurity) {
217  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
218  model->num_feature = n_features;
219  model->average_tree_output = false;
220  model->task_type = treelite::TaskType::kBinaryClfRegr;
221  model->task_param.grove_per_class = false;
222  model->task_param.output_type = treelite::TaskParam::OutputType::kFloat;
223  model->task_param.num_class = 1;
224  model->task_param.leaf_vector_size = 1;
225  std::strncpy(model->param.pred_transform, "identity", sizeof(model->param.pred_transform));
226  model->param.global_bias = 0.0f;
227  };
228  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
229  int n_classes, treelite::Tree<double, double>& dest_tree) {
230  const double leaf_value = value[tree_id][node_id];
231  dest_tree.SetLeaf(new_node_id, leaf_value);
232  };
233  return LoadSKLearnModel(n_estimators, n_features, 1, node_count, children_left,
234  children_right, feature, threshold, value, n_node_samples, weighted_n_node_samples, impurity,
235  meta_handler, leaf_handler);
236 }
237 
238 std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingClassifierBinary(
239  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
240  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
241  const double** threshold, const double** value, const int64_t** n_node_samples,
242  const double** weighted_n_node_samples, const double** impurity) {
243  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
244  model->num_feature = n_features;
245  model->average_tree_output = false;
246  model->task_type = treelite::TaskType::kBinaryClfRegr;
247  model->task_param.grove_per_class = false;
248  model->task_param.output_type = treelite::TaskParam::OutputType::kFloat;
249  model->task_param.num_class = 1;
250  model->task_param.leaf_vector_size = 1;
251  std::strncpy(model->param.pred_transform, "sigmoid", sizeof(model->param.pred_transform));
252  model->param.global_bias = 0.0f;
253  };
254  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
255  int n_classes, treelite::Tree<double, double>& dest_tree) {
256  const double leaf_value = value[tree_id][node_id];
257  dest_tree.SetLeaf(new_node_id, leaf_value);
258  };
259  return LoadSKLearnModel(n_estimators, n_features, n_classes, node_count, children_left,
260  children_right, feature, threshold, value, n_node_samples, weighted_n_node_samples, impurity,
261  meta_handler, leaf_handler);
262 }
263 
264 std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingClassifierMulticlass(
265  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
266  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
267  const double** threshold, const double** value, const int64_t** n_node_samples,
268  const double** weighted_n_node_samples, const double** impurity) {
269  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
270  model->num_feature = n_features;
271  model->average_tree_output = false;
272  model->task_type = treelite::TaskType::kMultiClfGrovePerClass;
273  model->task_param.grove_per_class = true;
274  model->task_param.output_type = treelite::TaskParam::OutputType::kFloat;
275  model->task_param.num_class = n_classes;
276  model->task_param.leaf_vector_size = 1;
277  std::strncpy(model->param.pred_transform, "softmax", sizeof(model->param.pred_transform));
278  model->param.global_bias = 0.0f;
279  };
280  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
281  int n_classes, treelite::Tree<double, double>& dest_tree) {
282  const double leaf_value = value[tree_id][node_id];
283  dest_tree.SetLeaf(new_node_id, leaf_value);
284  };
285  return LoadSKLearnModel(n_estimators * n_classes, n_features, n_classes, node_count,
286  children_left, children_right, feature, threshold, value, n_node_samples,
287  weighted_n_node_samples, impurity, meta_handler, leaf_handler);
288 }
289 
290 std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingClassifier(
291  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
292  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
293  const double** threshold, const double** value, const int64_t** n_node_samples,
294  const double** weighted_n_node_samples, const double** impurity) {
295  TREELITE_CHECK_GE(n_classes, 2);
296  if (n_classes == 2) {
297  return LoadSKLearnGradientBoostingClassifierBinary(n_estimators, n_features, n_classes,
298  node_count, children_left, children_right, feature, threshold, value, n_node_samples,
299  weighted_n_node_samples, impurity);
300  } else {
301  return LoadSKLearnGradientBoostingClassifierMulticlass(n_estimators, n_features, n_classes,
302  node_count, children_left, children_right, feature, threshold, value, n_node_samples,
303  weighted_n_node_samples, impurity);
304  }
305 }
306 
307 } // namespace frontend
308 } // namespace treelite
ModelParam param
extra parameters
Definition: tree.h:801
Collection of front-end methods to load or construct ensemble model.
void Init()
initialize the model with a single root node
Definition: tree_impl.h:704
bool grove_per_class
Whether we designate a subset of the trees to compute the prediction for each class.
Definition: tree.h:192
bool average_tree_output
whether to average tree outputs
Definition: tree.h:797
model structure for tree ensemble
unsigned int leaf_vector_size
Dimension of the output from each leaf node.
Definition: tree.h:207
void SetSumHess(int nid, double sum_hess)
set the hessian sum of the node
Definition: tree.h:636
in-memory representation of a decision tree
Definition: tree.h:222
logging facility for Treelite
unsigned int num_class
The number of classes in the target label.
Definition: tree.h:200
float global_bias
global bias of the model
Definition: tree.h:708
TaskType task_type
Task type.
Definition: tree.h:795
int32_t num_feature
number of features used for the model. It is assumed that all feature indices are between 0 and [num_...
Definition: tree.h:793
std::vector< Tree< ThresholdType, LeafOutputType > > trees
member trees
Definition: tree.h:838
std::unique_ptr< treelite::Model > LoadSKLearnRandomForestClassifier(int n_estimators, int n_features, int n_classes, const int64_t *node_count, const int64_t **children_left, const int64_t **children_right, const int64_t **feature, const double **threshold, const double **value, const int64_t **n_node_samples, const double **weighted_n_node_samples, const double **impurity)
Load a scikit-learn random forest classifier model from a collection of arrays. Refer to https://scik...
Definition: sklearn.cc:195
void SetGain(int nid, double gain)
set the gain value of the node
Definition: tree.h:656
void SetDataCount(int nid, uint64_t data_count)
set the data count of the node
Definition: tree.h:646
float ratio_c
scaling parameter for exponential standard ratio transformation expstdratio(x) = exp2(-x / c) ...
Definition: tree.h:701
int LeftChild(int nid) const
Getters.
Definition: tree.h:423
TaskParam task_param
Group of parameters that are specific to the particular task type.
Definition: tree.h:799
std::unique_ptr< treelite::Model > LoadSKLearnRandomForestRegressor(int n_estimators, int n_features, const int64_t *node_count, const int64_t **children_left, const int64_t **children_right, const int64_t **feature, const double **threshold, const double **value, const int64_t **n_node_samples, const double **weighted_n_node_samples, const double **impurity)
Load a scikit-learn random forest regressor model from a collection of arrays. Refer to https://sciki...
Definition: sklearn.cc:77
int RightChild(int nid) const
index of the node's right child
Definition: tree.h:430
void AddChilds(int nid)
add child nodes to node
Definition: tree_impl.h:719
OutputType output_type
The type of output from each leaf node.
Definition: tree.h:184
std::unique_ptr< treelite::Model > LoadSKLearnGradientBoostingRegressor(int n_estimators, int n_features, const int64_t *node_count, const int64_t **children_left, const int64_t **children_right, const int64_t **feature, const double **threshold, const double **value, const int64_t **n_node_samples, const double **weighted_n_node_samples, const double **impurity)
Load a scikit-learn gradient boosting regressor model from a collection of arrays. Refer to https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to learn the meaning of the arrays in detail.
Definition: sklearn.cc:212
thin wrapper for tree ensemble model
Definition: tree.h:734
std::unique_ptr< treelite::Model > LoadSKLearnIsolationForest(int n_estimators, int n_features, const int64_t *node_count, const int64_t **children_left, const int64_t **children_right, const int64_t **feature, const double **threshold, const double **value, const int64_t **n_node_samples, const double **weighted_n_node_samples, const double **impurity, const double ratio_c)
Load a scikit-learn isolation forest model from a collection of arrays. Refer to https://scikit-learn...
Definition: sklearn.cc:103
void SetNumericalSplit(int nid, unsigned split_index, ThresholdType threshold, bool default_left, Operator cmp)
Setters.
Definition: tree_impl.h:728
std::unique_ptr< treelite::Model > LoadSKLearnGradientBoostingClassifier(int n_estimators, int n_features, int n_classes, const int64_t *node_count, const int64_t **children_left, const int64_t **children_right, const int64_t **feature, const double **threshold, const double **value, const int64_t **n_node_samples, const double **weighted_n_node_samples, const double **impurity)
Load a scikit-learn gradient boosting classifier model from a collection of arrays. Refer to https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to learn the meaning of the arrays in detail.
Definition: sklearn.cc:290
char pred_transform[TREELITE_MAX_PRED_TRANSFORM_LENGTH]
name of prediction transform function
Definition: tree.h:685