Treelite
sklearn.cc
Go to the documentation of this file.
1 
7 #include <treelite/logging.h>
8 #include <treelite/frontend.h>
9 #include <treelite/tree.h>
10 #include <memory>
11 #include <queue>
12 #include <algorithm>
13 #include <numeric>
14 #include <tuple>
15 
16 namespace treelite {
17 namespace frontend {
18 
19 template <typename MetaHandlerFunc, typename LeafHandlerFunc>
20 std::unique_ptr<treelite::Model> LoadSKLearnModel(
21  int n_trees, int n_features, int n_classes, const int64_t* node_count,
22  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
23  const double** threshold, const double** value, const int64_t** n_node_samples,
24  const double** impurity, MetaHandlerFunc meta_handler, LeafHandlerFunc leaf_handler) {
25  TREELITE_CHECK_GT(n_trees, 0);
26  TREELITE_CHECK_GT(n_features, 0);
27 
28  std::unique_ptr<treelite::Model> model_ptr = treelite::Model::Create<double, double>();
29  meta_handler(model_ptr.get(), n_features, n_classes);
30  auto* model = dynamic_cast<treelite::ModelImpl<double, double>*>(model_ptr.get());
31 
32  for (int tree_id = 0; tree_id < n_trees; ++tree_id) {
33  model->trees.emplace_back();
34  treelite::Tree<double, double>& tree = model->trees.back();
35  tree.Init();
36 
37  // Assign node ID's so that a breadth-wise traversal would yield
38  // the monotonic sequence 0, 1, 2, ...
39  std::queue<std::pair<int64_t, int>> Q; // (old ID, new ID) pair
40  Q.push({0, 0});
41  const int64_t total_sample_cnt = n_node_samples[tree_id][0];
42  while (!Q.empty()) {
43  int64_t node_id;
44  int new_node_id;
45  std::tie(node_id, new_node_id) = Q.front(); Q.pop();
46  const int64_t left_child_id = children_left[tree_id][node_id];
47  const int64_t right_child_id = children_right[tree_id][node_id];
48  const int64_t sample_cnt = n_node_samples[tree_id][node_id];
49  if (left_child_id == -1) { // leaf node
50  leaf_handler(tree_id, node_id, new_node_id, value, n_classes, tree);
51  } else {
52  const int64_t split_index = feature[tree_id][node_id];
53  const double split_cond = threshold[tree_id][node_id];
54  const int64_t left_child_sample_cnt = n_node_samples[tree_id][left_child_id];
55  const int64_t right_child_sample_cnt = n_node_samples[tree_id][right_child_id];
56  const double gain = sample_cnt * (
57  impurity[tree_id][node_id]
58  - left_child_sample_cnt * impurity[tree_id][left_child_id] / sample_cnt
59  - right_child_sample_cnt * impurity[tree_id][right_child_id] / sample_cnt)
60  / total_sample_cnt;
61 
62  tree.AddChilds(new_node_id);
63  tree.SetNumericalSplit(new_node_id, split_index, split_cond, true, treelite::Operator::kLE);
64  tree.SetGain(new_node_id, gain);
65  Q.push({left_child_id, tree.LeftChild(new_node_id)});
66  Q.push({right_child_id, tree.RightChild(new_node_id)});
67  }
68  tree.SetDataCount(new_node_id, sample_cnt);
69  }
70  }
71  return model_ptr;
72 }
73 
74 std::unique_ptr<treelite::Model> LoadSKLearnRandomForestRegressor(
75  int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
76  const int64_t** children_right, const int64_t** feature, const double** threshold,
77  const double** value, const int64_t** n_node_samples, const double** impurity) {
78  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
79  model->num_feature = n_features;
80  model->average_tree_output = true;
81  model->task_type = treelite::TaskType::kBinaryClfRegr;
82  model->task_param.grove_per_class = false;
83  model->task_param.output_type = treelite::TaskParam::OutputType::kFloat;
84  model->task_param.num_class = 1;
85  model->task_param.leaf_vector_size = 1;
86  std::strncpy(model->param.pred_transform, "identity", sizeof(model->param.pred_transform));
87  model->param.global_bias = 0.0f;
88  };
89  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
90  int n_classes, treelite::Tree<double, double>& dest_tree) {
91  const double leaf_value = value[tree_id][node_id];
92  dest_tree.SetLeaf(new_node_id, leaf_value);
93  };
94  return LoadSKLearnModel(n_estimators, n_features, 1, node_count, children_left, children_right,
95  feature, threshold, value, n_node_samples, impurity, meta_handler, leaf_handler);
96 }
97 
98 std::unique_ptr<treelite::Model> LoadSKLearnRandomForestClassifierBinary(
99  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
100  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
101  const double** threshold, const double** value, const int64_t** n_node_samples,
102  const double** impurity) {
103  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
104  model->num_feature = n_features;
105  model->average_tree_output = true;
106  model->task_type = treelite::TaskType::kBinaryClfRegr;
107  model->task_param.grove_per_class = false;
108  model->task_param.output_type = treelite::TaskParam::OutputType::kFloat;
109  model->task_param.num_class = 1;
110  model->task_param.leaf_vector_size = 1;
111  std::strncpy(model->param.pred_transform, "identity", sizeof(model->param.pred_transform));
112  model->param.global_bias = 0.0f;
113  };
114  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
115  int n_classes, treelite::Tree<double, double>& dest_tree) {
116  // Get counts for each label (+/-) at this leaf node
117  const double* leaf_count = &value[tree_id][node_id * 2];
118  // Compute the fraction of positive data points at this leaf node
119  const double fraction_positive = leaf_count[1] / (leaf_count[0] + leaf_count[1]);
120  dest_tree.SetLeaf(new_node_id, fraction_positive);
121  };
122  return LoadSKLearnModel(n_estimators, n_features, n_classes, node_count, children_left,
123  children_right, feature, threshold, value, n_node_samples, impurity, meta_handler,
124  leaf_handler);
125 }
126 
127 std::unique_ptr<treelite::Model> LoadSKLearnRandomForestClassifierMulticlass(
128  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
129  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
130  const double** threshold, const double** value, const int64_t** n_node_samples,
131  const double** impurity) {
132  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
133  model->num_feature = n_features;
134  model->average_tree_output = true;
135  model->task_type = treelite::TaskType::kMultiClfProbDistLeaf;
136  model->task_param.grove_per_class = false;
137  model->task_param.output_type = treelite::TaskParam::OutputType::kFloat;
138  model->task_param.num_class = n_classes;
139  model->task_param.leaf_vector_size = n_classes;
140  std::strncpy(model->param.pred_transform, "identity_multiclass",
141  sizeof(model->param.pred_transform));
142  model->param.global_bias = 0.0f;
143  };
144  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
145  int n_classes, treelite::Tree<double, double>& dest_tree) {
146  // Get counts for each label class at this leaf node
147  std::vector<double> prob_distribution(&value[tree_id][node_id * n_classes],
148  &value[tree_id][(node_id + 1) * n_classes]);
149  // Compute the probability distribution over label classes
150  const double norm_factor =
151  std::accumulate(prob_distribution.begin(), prob_distribution.end(), 0.0);
152  std::for_each(prob_distribution.begin(), prob_distribution.end(), [norm_factor](double& e) {
153  e /= norm_factor;
154  });
155  dest_tree.SetLeafVector(new_node_id, prob_distribution);
156  };
157  return LoadSKLearnModel(n_estimators, n_features, n_classes, node_count, children_left,
158  children_right, feature, threshold, value, n_node_samples, impurity, meta_handler,
159  leaf_handler);
160 }
161 
162 std::unique_ptr<treelite::Model> LoadSKLearnRandomForestClassifier(
163  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
164  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
165  const double** threshold, const double** value, const int64_t** n_node_samples,
166  const double** impurity) {
167  TREELITE_CHECK_GE(n_classes, 2);
168  if (n_classes == 2) {
169  return LoadSKLearnRandomForestClassifierBinary(n_estimators, n_features, n_classes, node_count,
170  children_left, children_right, feature, threshold, value, n_node_samples, impurity);
171  } else {
172  return LoadSKLearnRandomForestClassifierMulticlass(n_estimators, n_features, n_classes,
173  node_count, children_left, children_right, feature, threshold, value, n_node_samples,
174  impurity);
175  }
176 }
177 
178 std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingRegressor(
179  int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
180  const int64_t** children_right, const int64_t** feature, const double** threshold,
181  const double** value, const int64_t** n_node_samples, const double** impurity) {
182  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
183  model->num_feature = n_features;
184  model->average_tree_output = false;
185  model->task_type = treelite::TaskType::kBinaryClfRegr;
186  model->task_param.grove_per_class = false;
187  model->task_param.output_type = treelite::TaskParam::OutputType::kFloat;
188  model->task_param.num_class = 1;
189  model->task_param.leaf_vector_size = 1;
190  std::strncpy(model->param.pred_transform, "identity", sizeof(model->param.pred_transform));
191  model->param.global_bias = 0.0f;
192  };
193  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
194  int n_classes, treelite::Tree<double, double>& dest_tree) {
195  const double leaf_value = value[tree_id][node_id];
196  dest_tree.SetLeaf(new_node_id, leaf_value);
197  };
198  return LoadSKLearnModel(n_estimators, n_features, 1, node_count, children_left,
199  children_right, feature, threshold, value, n_node_samples, impurity, meta_handler,
200  leaf_handler);
201 }
202 
203 std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingClassifierBinary(
204  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
205  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
206  const double** threshold, const double** value, const int64_t** n_node_samples,
207  const double** impurity) {
208  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
209  model->num_feature = n_features;
210  model->average_tree_output = false;
211  model->task_type = treelite::TaskType::kBinaryClfRegr;
212  model->task_param.grove_per_class = false;
213  model->task_param.output_type = treelite::TaskParam::OutputType::kFloat;
214  model->task_param.num_class = 1;
215  model->task_param.leaf_vector_size = 1;
216  std::strncpy(model->param.pred_transform, "sigmoid", sizeof(model->param.pred_transform));
217  model->param.global_bias = 0.0f;
218  };
219  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
220  int n_classes, treelite::Tree<double, double>& dest_tree) {
221  const double leaf_value = value[tree_id][node_id];
222  dest_tree.SetLeaf(new_node_id, leaf_value);
223  };
224  return LoadSKLearnModel(n_estimators, n_features, n_classes, node_count, children_left,
225  children_right, feature, threshold, value, n_node_samples, impurity, meta_handler,
226  leaf_handler);
227 }
228 
229 std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingClassifierMulticlass(
230  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
231  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
232  const double** threshold, const double** value, const int64_t** n_node_samples,
233  const double** impurity) {
234  auto meta_handler = [](treelite::Model* model, int n_features, int n_classes) {
235  model->num_feature = n_features;
236  model->average_tree_output = false;
237  model->task_type = treelite::TaskType::kMultiClfGrovePerClass;
238  model->task_param.grove_per_class = true;
239  model->task_param.output_type = treelite::TaskParam::OutputType::kFloat;
240  model->task_param.num_class = n_classes;
241  model->task_param.leaf_vector_size = 1;
242  std::strncpy(model->param.pred_transform, "softmax", sizeof(model->param.pred_transform));
243  model->param.global_bias = 0.0f;
244  };
245  auto leaf_handler = [](int tree_id, int64_t node_id, int new_node_id, const double** value,
246  int n_classes, treelite::Tree<double, double>& dest_tree) {
247  const double leaf_value = value[tree_id][node_id];
248  dest_tree.SetLeaf(new_node_id, leaf_value);
249  };
250  return LoadSKLearnModel(n_estimators * n_classes, n_features, n_classes, node_count,
251  children_left, children_right, feature, threshold, value, n_node_samples, impurity,
252  meta_handler, leaf_handler);
253 }
254 
255 std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingClassifier(
256  int n_estimators, int n_features, int n_classes, const int64_t* node_count,
257  const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
258  const double** threshold, const double** value, const int64_t** n_node_samples,
259  const double** impurity) {
260  TREELITE_CHECK_GE(n_classes, 2);
261  if (n_classes == 2) {
262  return LoadSKLearnGradientBoostingClassifierBinary(n_estimators, n_features, n_classes,
263  node_count, children_left, children_right, feature, threshold, value, n_node_samples,
264  impurity);
265  } else {
266  return LoadSKLearnGradientBoostingClassifierMulticlass(n_estimators, n_features, n_classes,
267  node_count, children_left, children_right, feature, threshold, value, n_node_samples,
268  impurity);
269  }
270 }
271 
272 } // namespace frontend
273 } // namespace treelite
ModelParam param
extra parameters
Definition: tree.h:675
std::unique_ptr< treelite::Model > LoadSKLearnRandomForestRegressor(int n_estimators, int n_features, const int64_t *node_count, const int64_t **children_left, const int64_t **children_right, const int64_t **feature, const double **threshold, const double **value, const int64_t **n_node_samples, const double **impurity)
Load a scikit-learn random forest regressor model from a collection of arrays. Refer to https://sciki...
Definition: sklearn.cc:74
Collection of front-end methods to load or construct ensemble model.
void Init()
initialize the model with a single root node
Definition: tree_impl.h:627
bool grove_per_class
Whether we designate a subset of the trees to compute the prediction for each class.
Definition: tree.h:168
bool average_tree_output
whether to average tree outputs
Definition: tree.h:671
model structure for tree ensemble
unsigned int leaf_vector_size
Dimension of the output from each leaf node.
Definition: tree.h:183
in-memory representation of a decision tree
Definition: tree.h:190
logging facility for Treelite
unsigned int num_class
The number of classes in the target label.
Definition: tree.h:176
float global_bias
global bias of the model
Definition: tree.h:600
TaskType task_type
Task type.
Definition: tree.h:669
std::unique_ptr< treelite::Model > LoadSKLearnRandomForestClassifier(int n_estimators, int n_features, int n_classes, const int64_t *node_count, const int64_t **children_left, const int64_t **children_right, const int64_t **feature, const double **threshold, const double **value, const int64_t **n_node_samples, const double **impurity)
Load a scikit-learn random forest classifier model from a collection of arrays. Refer to https://scik...
Definition: sklearn.cc:162
std::unique_ptr< treelite::Model > LoadSKLearnGradientBoostingClassifier(int n_estimators, int n_features, int n_classes, const int64_t *node_count, const int64_t **children_left, const int64_t **children_right, const int64_t **feature, const double **threshold, const double **value, const int64_t **n_node_samples, const double **impurity)
Load a scikit-learn gradient boosting classifier model from a collection of arrays. Refer to https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to learn the meaning of the arrays in detail.
Definition: sklearn.cc:255
std::vector< Tree< ThresholdType, LeafOutputType > > trees
member trees
Definition: tree.h:699
std::unique_ptr< treelite::Model > LoadSKLearnGradientBoostingRegressor(int n_estimators, int n_features, const int64_t *node_count, const int64_t **children_left, const int64_t **children_right, const int64_t **feature, const double **threshold, const double **value, const int64_t **n_node_samples, const double **impurity)
Load a scikit-learn gradient boosting regressor model from a collection of arrays. Refer to https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to learn the meaning of the arrays in detail.
Definition: sklearn.cc:178
void SetGain(int nid, double gain)
set the gain value of the node
Definition: tree.h:556
void SetDataCount(int nid, uint64_t data_count)
set the data count of the node
Definition: tree.h:546
int LeftChild(int nid) const
Getters.
Definition: tree.h:322
TaskParam task_param
Group of parameters that are specific to the particular task type.
Definition: tree.h:673
int RightChild(int nid) const
index of the node's right child
Definition: tree.h:329
void AddChilds(int nid)
add child nodes to node
Definition: tree_impl.h:640
OutputType output_type
The type of output from each leaf node.
Definition: tree.h:160
thin wrapper for tree ensemble model
Definition: tree.h:626
int num_feature
number of features used for the model. It is assumed that all feature indices are between 0 and [num_...
Definition: tree.h:667
void SetNumericalSplit(int nid, unsigned split_index, ThresholdType threshold, bool default_left, Operator cmp)
Setters.
Definition: tree_impl.h:678
char pred_transform[TREELITE_MAX_PRED_TRANSFORM_LENGTH]
name of prediction transform function
Definition: tree.h:585