Skip to content

Commit

Permalink
Add weighted_n_node_samples field in sklearn importer
Browse files Browse the repository at this point in the history
  • Loading branch information
hcho3 committed Dec 20, 2021
1 parent 5d80909 commit 5c6861a
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 62 deletions.
26 changes: 18 additions & 8 deletions include/treelite/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ TREELITE_DLL int TreeliteLoadXGBoostModelFromMemoryBuffer(const void* buf,
* if node k is a leaf node.
* \param n_node_samples n_node_samples[i][k] stores the number of data samples associated with
* node k of the i-th tree.
* \param weighted_n_node_samples weighted_n_node_samples[i][k] stores the sum of the weights of
* the data samples associated with node k of the i-th tree.
* \param impurity impurity[i][k] stores the impurity measure (gini, entropy etc) associated with
* node k of the i-th tree.
* \param out pointer to store the loaded model
Expand All @@ -189,8 +191,8 @@ TREELITE_DLL int TreeliteLoadXGBoostModelFromMemoryBuffer(const void* buf,
TREELITE_DLL int TreeliteLoadSKLearnRandomForestRegressor(
int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
const int64_t** children_right, const int64_t** feature, const double** threshold,
const double** value, const int64_t** n_node_samples, const double** impurity,
ModelHandle* out);
const double** value, const int64_t** n_node_samples, const double** weighted_n_node_samples,
const double** impurity, ModelHandle* out);

/*!
* \brief Load a scikit-learn isolation forest model from a collection of arrays. Refer to
Expand All @@ -211,6 +213,8 @@ TREELITE_DLL int TreeliteLoadSKLearnRandomForestRegressor(
* only defined if node k is a leaf node.
* \param n_node_samples n_node_samples[i][k] stores the number of data samples associated with
* node k of the i-th tree.
* \param weighted_n_node_samples weighted_n_node_samples[i][k] stores the sum of the weights of
* the data samples associated with node k of the i-th tree.
* \param impurity not used, but must be passed as an array of arrays for each tree and node.
* \param ratio_c standardizing constant to use for calculation of the anomaly score.
* \param out pointer to store the loaded model
Expand All @@ -219,8 +223,8 @@ TREELITE_DLL int TreeliteLoadSKLearnRandomForestRegressor(
TREELITE_DLL int TreeliteLoadSKLearnIsolationForest(
int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
const int64_t** children_right, const int64_t** feature, const double** threshold,
const double** value, const int64_t** n_node_samples, const double** impurity,
const double ratio_c, ModelHandle* out);
const double** value, const int64_t** n_node_samples, const double** weighted_n_node_samples,
const double** impurity, const double ratio_c, ModelHandle* out);

/*!
* \brief Load a scikit-learn random forest classifier model from a collection of arrays. Refer to
Expand All @@ -243,6 +247,8 @@ TREELITE_DLL int TreeliteLoadSKLearnIsolationForest(
* if node k is a leaf node.
* \param n_node_samples n_node_samples[i][k] stores the number of data samples associated with
* node k of the i-th tree.
* \param weighted_n_node_samples weighted_n_node_samples[i][k] stores the sum of the weights of
* the data samples associated with node k of the i-th tree.
* \param impurity impurity[i][k] stores the impurity measure (gini, entropy etc) associated with
* node k of the i-th tree.
* \param out pointer to store the loaded model
Expand All @@ -252,7 +258,7 @@ TREELITE_DLL int TreeliteLoadSKLearnRandomForestClassifier(
int n_estimators, int n_features, int n_classes, const int64_t* node_count,
const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
const double** threshold, const double** value, const int64_t** n_node_samples,
const double** impurity, ModelHandle* out);
const double** weighted_n_node_samples, const double** impurity, ModelHandle* out);

/*!
* \brief Load a scikit-learn gradient boosting regressor model from a collection of arrays. Refer
Expand All @@ -273,6 +279,8 @@ TREELITE_DLL int TreeliteLoadSKLearnRandomForestClassifier(
* if node k is a leaf node.
* \param n_node_samples n_node_samples[i][k] stores the number of data samples associated with
* node k of the i-th tree.
* \param weighted_n_node_samples weighted_n_node_samples[i][k] stores the sum of the weights of
* the data samples associated with node k of the i-th tree.
* \param impurity impurity[i][k] stores the impurity measure (gini, entropy etc) associated with
* node k of the i-th tree.
* \param out pointer to store the loaded model
Expand All @@ -281,8 +289,8 @@ TREELITE_DLL int TreeliteLoadSKLearnRandomForestClassifier(
TREELITE_DLL int TreeliteLoadSKLearnGradientBoostingRegressor(
int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
const int64_t** children_right, const int64_t** feature, const double** threshold,
const double** value, const int64_t** n_node_samples, const double** impurity,
ModelHandle* out);
const double** value, const int64_t** n_node_samples, const double** weighted_n_node_samples,
const double** impurity, ModelHandle* out);

/*!
* \brief Load a scikit-learn gradient boosting classifier model from a collection of arrays. Refer
Expand All @@ -304,6 +312,8 @@ TREELITE_DLL int TreeliteLoadSKLearnGradientBoostingRegressor(
* if node k is a leaf node.
* \param n_node_samples n_node_samples[i][k] stores the number of data samples associated with
* node k of the i-th tree.
* \param weighted_n_node_samples weighted_n_node_samples[i][k] stores the sum of the weights of
* the data samples associated with node k of the i-th tree.
* \param impurity impurity[i][k] stores the impurity measure (gini, entropy etc) associated with
* node k of the i-th tree.
* \param out pointer to store the loaded model
Expand All @@ -313,7 +323,7 @@ TREELITE_DLL int TreeliteLoadSKLearnGradientBoostingClassifier(
int n_estimators, int n_features, int n_classes, const int64_t* node_count,
const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
const double** threshold, const double** value, const int64_t** n_node_samples,
const double** impurity, ModelHandle* out);
const double** weighted_n_node_samples, const double** impurity, ModelHandle* out);

/*!
* \brief Query the number of trees in the model
Expand Down
24 changes: 18 additions & 6 deletions include/treelite/frontend.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,17 @@ std::unique_ptr<treelite::Model> LoadXGBoostJSONModelString(const char* json_str
* if node k is a leaf node.
* \param n_node_samples n_node_samples[i][k] stores the number of data samples associated with
* node k of the i-th tree.
* \param weighted_n_node_samples weighted_n_node_samples[i][k] stores the sum of the weights of
* the data samples associated with node k of the i-th tree.
* \param impurity impurity[i][k] stores the impurity measure (gini, entropy etc) associated with
* node k of the i-th tree.
* \return loaded model
*/
std::unique_ptr<treelite::Model> LoadSKLearnRandomForestRegressor(
int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
const int64_t** children_right, const int64_t** feature, const double** threshold,
const double** value, const int64_t** n_node_samples, const double** impurity);
const double** value, const int64_t** n_node_samples, const double** weighted_n_node_samples,
const double** impurity);
/*!
* \brief Load a scikit-learn isolation forest model from a collection of arrays. Refer to
* https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to
Expand All @@ -104,15 +107,17 @@ std::unique_ptr<treelite::Model> LoadSKLearnRandomForestRegressor(
* only defined if node k is a leaf node.
* \param n_node_samples n_node_samples[i][k] stores the number of data samples associated with
* node k of the i-th tree.
* \param weighted_n_node_samples weighted_n_node_samples[i][k] stores the sum of the weights of
* the data samples associated with node k of the i-th tree.
* \param impurity not used, but must be passed as an array of arrays for each tree and node.
* \param ratio_c standardizing constant to use for calculation of the anomaly score.
* \return loaded model
*/
std::unique_ptr<treelite::Model> LoadSKLearnIsolationForest(
int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
const int64_t** children_right, const int64_t** feature, const double** threshold,
const double** value, const int64_t** n_node_samples, const double** impurity,
const double ratio_c);
const double** value, const int64_t** n_node_samples, const double** weighted_n_node_samples,
const double** impurity, const double ratio_c);
/*!
* \brief Load a scikit-learn random forest classifier model from a collection of arrays. Refer to
* https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to
Expand All @@ -134,6 +139,8 @@ std::unique_ptr<treelite::Model> LoadSKLearnIsolationForest(
* if node k is a leaf node.
* \param n_node_samples n_node_samples[i][k] stores the number of data samples associated with
* node k of the i-th tree.
* \param weighted_n_node_samples weighted_n_node_samples[i][k] stores the sum of the weights of
* the data samples associated with node k of the i-th tree.
* \param impurity impurity[i][k] stores the impurity measure (gini, entropy etc) associated with
* node k of the i-th tree.
* \return loaded model
Expand All @@ -142,7 +149,7 @@ std::unique_ptr<treelite::Model> LoadSKLearnRandomForestClassifier(
int n_estimators, int n_features, int n_classes, const int64_t* node_count,
const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
const double** threshold, const double** value, const int64_t** n_node_samples,
const double** impurity);
const double** weighted_n_node_samples, const double** impurity);
/*!
* \brief Load a scikit-learn gradient boosting regressor model from a collection of arrays. Refer
* to https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to
Expand All @@ -162,14 +169,17 @@ std::unique_ptr<treelite::Model> LoadSKLearnRandomForestClassifier(
* if node k is a leaf node.
* \param n_node_samples n_node_samples[i][k] stores the number of data samples associated with
* node k of the i-th tree.
* \param weighted_n_node_samples weighted_n_node_samples[i][k] stores the sum of the weights of
* the data samples associated with node k of the i-th tree.
* \param impurity impurity[i][k] stores the impurity measure (gini, entropy etc) associated with
* node k of the i-th tree.
* \return loaded model
*/
std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingRegressor(
int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
const int64_t** children_right, const int64_t** feature, const double** threshold,
const double** value, const int64_t** n_node_samples, const double** impurity);
const double** value, const int64_t** n_node_samples, const double** weighted_n_node_samples,
const double** impurity);
/*!
* \brief Load a scikit-learn gradient boosting classifier model from a collection of arrays. Refer
* to https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to
Expand All @@ -190,6 +200,8 @@ std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingRegressor(
* if node k is a leaf node.
* \param n_node_samples n_node_samples[i][k] stores the number of data samples associated with
* node k of the i-th tree.
* \param weighted_n_node_samples weighted_n_node_samples[i][k] stores the sum of the weights of
* the data samples associated with node k of the i-th tree.
* \param impurity impurity[i][k] stores the impurity measure (gini, entropy etc) associated with
* node k of the i-th tree.
* \return loaded model
Expand All @@ -198,7 +210,7 @@ std::unique_ptr<treelite::Model> LoadSKLearnGradientBoostingClassifier(
int n_estimators, int n_features, int n_classes, const int64_t* node_count,
const int64_t** children_left, const int64_t** children_right, const int64_t** feature,
const double** threshold, const double** value, const int64_t** n_node_samples,
const double** impurity);
const double** weighted_n_node_samples, const double** impurity);

//--------------------------------------------------------------------------
// model builder interface: build trees incrementally
Expand Down
19 changes: 11 additions & 8 deletions python/treelite/sklearn/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ def import_model(sklearn_model):
threshold = ArrayOfArrays(dtype=np.float64)
value = ArrayOfArrays(dtype=np.float64)
n_node_samples = ArrayOfArrays(dtype=np.int64)
weighted_n_node_samples = ArrayOfArrays(dtype=np.float64)
impurity = ArrayOfArrays(dtype=np.float64)
for estimator in sklearn_model.estimators_:
if isinstance(sklearn_model, (GradientBoostingR, GradientBoostingC)):
Expand Down Expand Up @@ -170,6 +171,8 @@ def import_model(sklearn_model):
value.add(isolation_depths.reshape((-1,1,1)),
expected_shape=leaf_value_expected_shape(tree.node_count))
n_node_samples.add(tree.n_node_samples, expected_shape=(tree.node_count,))
weighted_n_node_samples.add(tree.weighted_n_node_samples,
expected_shape=(tree.node_count,))
impurity.add(tree.impurity, expected_shape=(tree.node_count,))

handle = ctypes.c_void_p()
Expand All @@ -178,36 +181,36 @@ def import_model(sklearn_model):
ctypes.c_int(sklearn_model.n_estimators), ctypes.c_int(sklearn_model.n_features_),
c_array(ctypes.c_int64, node_count), children_left.as_c_array(),
children_right.as_c_array(), feature.as_c_array(), threshold.as_c_array(),
value.as_c_array(), n_node_samples.as_c_array(), impurity.as_c_array(),
ctypes.byref(handle)))
value.as_c_array(), n_node_samples.as_c_array(), weighted_n_node_samples.as_c_array(),
impurity.as_c_array(), ctypes.byref(handle)))
elif isinstance(sklearn_model, IsolationForest):
_check_call(_LIB.TreeliteLoadSKLearnIsolationForest(
ctypes.c_int(sklearn_model.n_estimators), ctypes.c_int(sklearn_model.n_features_),
c_array(ctypes.c_int64, node_count), children_left.as_c_array(),
children_right.as_c_array(), feature.as_c_array(), threshold.as_c_array(),
value.as_c_array(), n_node_samples.as_c_array(), impurity.as_c_array(),
ctypes.c_double(ratio_c), ctypes.byref(handle)))
value.as_c_array(), n_node_samples.as_c_array(), weighted_n_node_samples.as_c_array(),
impurity.as_c_array(), ctypes.c_double(ratio_c), ctypes.byref(handle)))
elif isinstance(sklearn_model, (RandomForestC, ExtraTreesC)):
_check_call(_LIB.TreeliteLoadSKLearnRandomForestClassifier(
ctypes.c_int(sklearn_model.n_estimators), ctypes.c_int(sklearn_model.n_features_),
ctypes.c_int(sklearn_model.n_classes_), c_array(ctypes.c_int64, node_count),
children_left.as_c_array(), children_right.as_c_array(), feature.as_c_array(),
threshold.as_c_array(), value.as_c_array(), n_node_samples.as_c_array(),
impurity.as_c_array(), ctypes.byref(handle)))
weighted_n_node_samples.as_c_array(), impurity.as_c_array(), ctypes.byref(handle)))
elif isinstance(sklearn_model, GradientBoostingR):
_check_call(_LIB.TreeliteLoadSKLearnGradientBoostingRegressor(
ctypes.c_int(sklearn_model.n_estimators), ctypes.c_int(sklearn_model.n_features_),
c_array(ctypes.c_int64, node_count), children_left.as_c_array(),
children_right.as_c_array(), feature.as_c_array(), threshold.as_c_array(),
value.as_c_array(), n_node_samples.as_c_array(), impurity.as_c_array(),
ctypes.byref(handle)))
value.as_c_array(), n_node_samples.as_c_array(), weighted_n_node_samples.as_c_array(),
impurity.as_c_array(), ctypes.byref(handle)))
elif isinstance(sklearn_model, GradientBoostingC):
_check_call(_LIB.TreeliteLoadSKLearnGradientBoostingClassifier(
ctypes.c_int(sklearn_model.n_estimators), ctypes.c_int(sklearn_model.n_features_),
ctypes.c_int(sklearn_model.n_classes_), c_array(ctypes.c_int64, node_count),
children_left.as_c_array(), children_right.as_c_array(), feature.as_c_array(),
threshold.as_c_array(), value.as_c_array(), n_node_samples.as_c_array(),
impurity.as_c_array(), ctypes.byref(handle)))
weighted_n_node_samples.as_c_array(), impurity.as_c_array(), ctypes.byref(handle)))
else:
raise TreeliteError(f'Unsupported model type {sklearn_model.__class__.__name__}: ' +
'currently random forests, extremely randomized trees, and gradient ' +
Expand Down
Loading

0 comments on commit 5c6861a

Please sign in to comment.