Skip to content

Commit

Permalink
RF: Variable binning and other minor refactoring (rapidsai#4479)
Browse files Browse the repository at this point in the history
* This PR enables a variable number of bins, capped at `max_n_bins`, for the feature quantiles. This makes Decision Trees more robust across a wider variety of datasets by avoiding redundant bins for columns that have fewer unique values.
* Added tests for the same
* Some accompanying changes in naming and format of the structures used for passing input and quantiles
* Deleting the `cpp/test/sg/decisiontree_batchedlevel_*` files as they are not tested.
* Changed param `n_bins` to `max_n_bins` in the C++ layer to differentiate its meaning from the actual `n_bins` used [here](https://github.com/rapidsai/cuml/blob/eb62fecce4211a1022cf19380be31981680fc5ab/cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels_impl.cuh#L266)
* The Python layer still maintains `n_bins`, but its docstrings have been tweaked to convey that it denotes the "_maximum bins used_"
* Other variable renamings in the core Decision Tree classes
---

This PR does not improve performance on the GBM-bench datasets by much, as almost all of the features have more unique values than the `n_bins` used.

![comparison_gbm_main_vs_test](https://user-images.githubusercontent.com/23023424/149161138-9dc1cfea-9890-4f96-8eef-8b938e44d10c.png)

Authors:
  - Venkat (https://github.com/venkywonka)

Approvers:
  - Rory Mitchell (https://github.com/RAMitchell)
  - Vinay Deshpande (https://github.com/vinaydes)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: rapidsai#4479
  • Loading branch information
venkywonka authored Feb 3, 2022
1 parent daa0a9f commit 204ea94
Show file tree
Hide file tree
Showing 37 changed files with 636 additions and 1,020 deletions.
2 changes: 1 addition & 1 deletion cpp/bench/sg/fil.cu
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ std::vector<Params> getInputs()
p.rf = set_rf_params(10, /*max_depth */
(1 << 20), /* max_leaves */
1.f, /* max_features */
32, /* n_bins */
32, /* max_n_bins */
3, /* min_samples_leaf */
3, /* min_samples_split */
0.0f, /* min_impurity_decrease */
Expand Down
2 changes: 1 addition & 1 deletion cpp/bench/sg/rf_classifier.cu
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ std::vector<Params> getInputs()
p.rf = set_rf_params(10, /*max_depth */
(1 << 20), /* max_leaves */
0.3, /* max_features */
32, /* n_bins */
32, /* max_n_bins */
3, /* min_samples_leaf */
3, /* min_samples_split */
0.0f, /* min_impurity_decrease */
Expand Down
2 changes: 1 addition & 1 deletion cpp/bench/sg/rf_regressor.cu
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ std::vector<RegParams> getInputs()
p.rf = set_rf_params(10, /*max_depth */
(1 << 20), /* max_leaves */
0.3, /* max_features */
32, /* n_bins */
32, /* max_n_bins */
3, /* min_samples_leaf */
3, /* min_samples_split */
0.0f, /* min_impurity_decrease */
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cuml/ensemble/randomforest.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -191,7 +191,7 @@ RF_metrics score(const raft::handle_t& user_handle,
RF_params set_rf_params(int max_depth,
int max_leaves,
float max_features,
int n_bins,
int max_n_bins,
int min_samples_leaf,
int min_samples_split,
float min_impurity_decrease,
Expand Down
12 changes: 6 additions & 6 deletions cpp/include/cuml/tree/decisiontree.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -40,9 +40,9 @@ struct DecisionTreeParams {
*/
float max_features;
/**
* Number of bins used by the split algorithm.
* maximum number of bins used by the split algorithm per feature.
*/
int n_bins;
int max_n_bins;
/**
* The minimum number of samples (rows) in each leaf node.
*/
Expand Down Expand Up @@ -74,7 +74,7 @@ struct DecisionTreeParams {
* @param[in] cfg_max_depth: maximum tree depth; default -1
* @param[in] cfg_max_leaves: maximum leaves; default -1
* @param[in] cfg_max_features: maximum number of features; default 1.0f
* @param[in] cfg_n_bins: number of bins; default 8
* @param[in] cfg_max_n_bins: maximum number of bins; default 128
* @param[in] cfg_min_samples_leaf: min. rows in each leaf node; default 1
* @param[in] cfg_min_samples_split: min. rows needed to split an internal node;
* default 2
Expand All @@ -84,13 +84,13 @@ struct DecisionTreeParams {
* i.e., GINI for classification or MSE for regression
* @param[in] cfg_max_batch_size: Maximum number of nodes that can be processed
in a batch. This is used only for batched-level algo. Default
value 128.
value 4096.
*/
void set_tree_params(DecisionTreeParams& params,
int cfg_max_depth = -1,
int cfg_max_leaves = -1,
float cfg_max_features = 1.0f,
int cfg_n_bins = 128,
int cfg_max_n_bins = 128,
int cfg_min_samples_leaf = 1,
int cfg_min_samples_split = 2,
float cfg_min_impurity_decrease = 0.0f,
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/decisiontree/batched-levelalgo/bins.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,9 +25,9 @@ struct CountBin {
HDI CountBin(int x_) : x(x_) {}
HDI CountBin() : x(0) {}

DI static void IncrementHistogram(CountBin* hist, int nbins, int b, int label)
DI static void IncrementHistogram(CountBin* hist, int n_bins, int b, int label)
{
auto offset = label * nbins + b;
auto offset = label * n_bins + b;
CountBin::AtomicAdd(hist + offset, {1});
}
DI static void AtomicAdd(CountBin* address, CountBin val) { atomicAdd(&address->x, val.x); }
Expand All @@ -51,7 +51,7 @@ struct AggregateBin {
HDI AggregateBin() : label_sum(0.0), count(0) {}
HDI AggregateBin(double label_sum, int count) : label_sum(label_sum), count(count) {}

DI static void IncrementHistogram(AggregateBin* hist, int nbins, int b, double label)
DI static void IncrementHistogram(AggregateBin* hist, int n_bins, int b, double label)
{
AggregateBin::AtomicAdd(hist + b, {label, 1});
}
Expand Down
Loading

0 comments on commit 204ea94

Please sign in to comment.