Remove old RF backend (#3868)

Removes the old random forest backend along with the split_algo and use_experimental_backend parameters. The default n_bins for random forest classification and regression is now 128.
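For C++ callers, the change amounts to dropping four arguments (split_algo, bootstrap_features, quantile_per_tree, use_experimental_backend) from set_rf_params; the new declaration appears in the randomforest.hpp hunk below. A minimal before/after sketch, assuming set_rf_params and RF_params live in the ML namespace as in that header; the concrete values are illustrative only, not defaults introduced by this commit:

#include <cuml/ensemble/randomforest.hpp>

// Before this commit (arguments for the removed backend still present):
//   ML::RF_params p = ML::set_rf_params(max_depth, max_leaves, max_features, n_bins,
//                                       split_algo, min_samples_leaf, min_samples_split,
//                                       min_impurity_decrease, bootstrap_features,
//                                       bootstrap, n_trees, max_samples, seed,
//                                       split_criterion, quantile_per_tree, n_streams,
//                                       use_experimental_backend, max_batch_size);
// After this commit: split_algo, bootstrap_features, quantile_per_tree and
// use_experimental_backend are gone from the signature.
ML::RF_params make_example_params() {
  return ML::set_rf_params(/*max_depth=*/16, /*max_leaves=*/-1,
                           /*max_features=*/1.0f, /*n_bins=*/128,
                           /*min_samples_leaf=*/1, /*min_samples_split=*/2,
                           /*min_impurity_decrease=*/0.0f, /*bootstrap=*/true,
                           /*n_trees=*/100, /*max_samples=*/1.0f,
                           /*seed=*/1234ULL, ML::CRITERION::GINI,
                           /*cfg_n_streams=*/4, /*max_batch_size=*/128);
}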

Authors:
  - Rory Mitchell (https://github.com/RAMitchell)
  - Venkat (https://github.com/venkywonka)

Approvers:
  - Thejaswi. N. S (https://github.com/teju85)
  - Vinay Deshpande (https://github.com/vinaydes)
  - Philip Hyunsu Cho (https://github.com/hcho3)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: #3868
RAMitchell authored Jun 17, 2021
1 parent edecd3b commit 1fcad06
Showing 42 changed files with 837 additions and 5,817 deletions.
4 changes: 0 additions & 4 deletions cpp/bench/sg/fil.cu
@@ -150,19 +150,15 @@ std::vector<Params> getInputs() {
(1 << 20), /* max_leaves */
1.f, /* max_features */
32, /* n_bins */
1, /* split_algo */
3, /* min_samples_leaf */
3, /* min_samples_split */
0.0f, /* min_impurity_decrease */
true, /* bootstrap_features */
true, /* bootstrap */
1, /* n_trees */
1.f, /* max_samples */
1234ULL, /* seed */
ML::CRITERION::MSE, /* split_criterion */
false, /* quantile_per_tree */
8, /* n_streams */
false, /* use_experimental_backend */
128 /* max_batch_size */
);

4 changes: 0 additions & 4 deletions cpp/bench/sg/rf_classifier.cu
@@ -86,19 +86,15 @@ std::vector<Params> getInputs() {
(1 << 20), /* max_leaves */
0.3, /* max_features */
32, /* n_bins */
1, /* split_algo */
3, /* min_samples_leaf */
3, /* min_samples_split */
0.0f, /* min_impurity_decrease */
true, /* bootstrap_features */
true, /* bootstrap */
500, /* n_trees */
1.f, /* max_samples */
1234ULL, /* seed */
ML::CRITERION::GINI, /* split_criterion */
false, /* quantile_per_tree */
8, /* n_streams */
false, /* use_experimental_backend */
128 /* max_batch_size */
);

4 changes: 0 additions & 4 deletions cpp/bench/sg/rf_regressor.cu
@@ -88,19 +88,15 @@ std::vector<RegParams> getInputs() {
(1 << 20), /* max_leaves */
0.3, /* max_features */
32, /* n_bins */
1, /* split_algo */
3, /* min_samples_leaf */
3, /* min_samples_split */
0.0f, /* min_impurity_decrease */
true, /* bootstrap_features */
true, /* bootstrap */
500, /* n_trees */
1.f, /* max_samples */
1234ULL, /* seed */
ML::CRITERION::MSE, /* split_criterion */
false, /* quantile_per_tree */
8, /* n_streams */
false, /* use_experimental_backend */
128 /* max_batch_size */
);
std::vector<DimInfo> dim_info = {{500000, 500, 400}};
10 changes: 4 additions & 6 deletions cpp/include/cuml/ensemble/randomforest.hpp
@@ -181,12 +181,10 @@ RF_metrics score(const raft::handle_t& user_handle,
int verbosity = CUML_LEVEL_INFO);

RF_params set_rf_params(int max_depth, int max_leaves, float max_features,
int n_bins, int split_algo, int min_samples_leaf,
int min_samples_split, float min_impurity_decrease,
bool bootstrap_features, bool bootstrap, int n_trees,
float max_samples, uint64_t seed,
CRITERION split_criterion, bool quantile_per_tree,
int cfg_n_streams, bool use_experimental_backend,
int n_bins, int min_samples_leaf, int min_samples_split,
float min_impurity_decrease, bool bootstrap,
int n_trees, float max_samples, uint64_t seed,
CRITERION split_criterion, int cfg_n_streams,
int max_batch_size);

// ----------------------------- Regression ----------------------------------- //
8 changes: 1 addition & 7 deletions cpp/include/cuml/tree/algo_helper.h
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,12 +17,6 @@
#pragma once

namespace ML {
enum SPLIT_ALGO {
HIST,
GLOBAL_QUANTILE,
SPLIT_ALGO_END,
};

enum CRITERION {
GINI,
ENTROPY,
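Because the SPLIT_ALGO enum is deleted outright, any downstream code that still names it stops compiling; CRITERION is the only selector that survives in this header. A hypothetical caller-side snippet, not part of this commit, showing what remains available after the cleanup:

#include <cuml/tree/algo_helper.h>

// References such as ML::SPLIT_ALGO::HIST or ML::SPLIT_ALGO::GLOBAL_QUANTILE
// must simply be deleted; the remaining backend always splits on precomputed
// global quantiles, so there is nothing left to select.
ML::CRITERION pick_criterion(bool classification) {
  return classification ? ML::CRITERION::GINI : ML::CRITERION::MSE;
}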
35 changes: 2 additions & 33 deletions cpp/include/cuml/tree/decisiontree.hpp
@@ -44,10 +44,6 @@ struct DecisionTreeParams {
* Number of bins used by the split algorithm.
*/
int n_bins;
/**
* The split algorithm: HIST or GLOBAL_QUANTILE.
*/
int split_algo;
/**
* The minimum number of samples (rows) in each leaf node.
*/
@@ -57,16 +53,7 @@
*/
int min_samples_split;
/**
* Control bootstrapping for features. If features are drawn with or without replacement
*/
bool bootstrap_features;
/**
* Whether a quantile needs to be computed for individual trees in RF.
* Default: compute quantiles once per RF. Only affects GLOBAL_QUANTILE split_algo.
*/
bool quantile_per_tree;
/**
* Node split criterion. GINI and Entropy for classification, MSE or MAE for regression.
* Node split criterion. GINI and Entropy for classification, MSE for regression.
*/
CRITERION split_criterion;
/**
@@ -79,14 +66,6 @@
* used only for batched-level algo
*/
int max_batch_size;
/**
* If set to true and following conditions are also met, experimental decision
* tree training implementation would be used:
* split_algo = 1 (GLOBAL_QUANTILE)
* max_features = 1.0 (Feature sub-sampling disabled)
* quantile_per_tree = false (No per tree quantile computation)
*/
bool use_experimental_backend;
};

/**
@@ -96,33 +75,23 @@
* @param[in] cfg_max_leaves: maximum leaves; default -1
* @param[in] cfg_max_features: maximum number of features; default 1.0f
* @param[in] cfg_n_bins: number of bins; default 8
* @param[in] cfg_split_algo: split algorithm; default SPLIT_ALGO::HIST
* @param[in] cfg_min_samples_leaf: min. rows in each leaf node; default 1
* @param[in] cfg_min_samples_split: min. rows needed to split an internal node;
* default 2
* @param[in] cfg_min_impurity_decrease: split a node only if its reduction in
* impurity is more than this value
* @param[in] cfg_bootstrap_features: bootstrapping for features; default false
* @param[in] cfg_split_criterion: split criterion; default CRITERION_END,
* i.e., GINI for classification or MSE for regression
* @param[in] cfg_quantile_per_tree: compute quantile per tree; default false
* @param[in] cfg_use_experimental_backend: When set to true, experimental batched
* backend is used (provided other conditions are met). Default is
True.
* @param[in] cfg_max_batch_size: Maximum number of nodes that can be processed
in a batch. This is used only for batched-level algo. Default
value 128.
*/
void set_tree_params(DecisionTreeParams &params, int cfg_max_depth = -1,
int cfg_max_leaves = -1, float cfg_max_features = 1.0f,
int cfg_n_bins = 8, int cfg_split_algo = SPLIT_ALGO::HIST,
int cfg_min_samples_leaf = 1,
int cfg_n_bins = 128, int cfg_min_samples_leaf = 1,
int cfg_min_samples_split = 2,
float cfg_min_impurity_decrease = 0.0f,
bool cfg_bootstrap_features = false,
CRITERION cfg_split_criterion = CRITERION_END,
bool cfg_quantile_per_tree = false,
bool cfg_use_experimental_backend = true,
int cfg_max_batch_size = 128);

/**
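For the decision tree layer itself, the most visible effects of the declaration above are the new default n_bins of 128 and the slimmer DecisionTreeParams struct. A minimal sketch of configuring a tree against the new API, assuming the usual ML::DecisionTree namespace from this header; the max_depth value is illustrative:

#include <cuml/tree/decisiontree.hpp>

void configure_tree_example() {
  ML::DecisionTree::DecisionTreeParams params;
  // Only the depth is overridden here; everything else takes the new defaults:
  // n_bins = 128, min_samples_leaf = 1, min_samples_split = 2,
  // split_criterion = CRITERION_END (GINI for classification, MSE for
  // regression), max_batch_size = 128.
  ML::DecisionTree::set_tree_params(params, /*cfg_max_depth=*/16);
  // Fields removed by this commit (split_algo, bootstrap_features,
  // quantile_per_tree, use_experimental_backend) no longer exist on the struct.
}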
135 changes: 35 additions & 100 deletions cpp/src/decisiontree/decisiontree.cu
@@ -14,7 +14,9 @@
* limitations under the License.
*/

#include <cuml/common/device_buffer.hpp>
#include <cuml/tree/decisiontree.hpp>
#include <raft/handle.hpp>

#include <cuml/tree/flatnode.h>
#include "decisiontree_impl.cuh"
@@ -29,59 +31,25 @@ namespace DecisionTree {
* @param[in] cfg_max_leaves: maximum leaves; default -1
* @param[in] cfg_max_features: maximum number of features; default 1.0f
* @param[in] cfg_n_bins: number of bins; default 8
* @param[in] cfg_split_algo: split algorithm; default SPLIT_ALGO::HIST
* @param[in] cfg_min_samples_leaf: min. rows in each leaf node; default 1
* @param[in] cfg_min_samples_split: min. rows needed to split an internal node;
* default 2
* @param[in] cfg_bootstrap_features: bootstrapping for features; default false
* @param[in] cfg_split_criterion: split criterion; default CRITERION_END,
* i.e., GINI for classification or MSE for regression
* @param[in] cfg_quantile_per_tree: compute quantile per tree; default false
* @param[in] cfg_use_experimental_backend: Switch to using experimental
backend; default false
* @param[in] cfg_max_batch_size: batch size for experimental backend
*/
void set_tree_params(DecisionTreeParams &params, int cfg_max_depth,
int cfg_max_leaves, float cfg_max_features, int cfg_n_bins,
int cfg_split_algo, int cfg_min_samples_leaf,
int cfg_min_samples_split, float cfg_min_impurity_decrease,
bool cfg_bootstrap_features, CRITERION cfg_split_criterion,
bool cfg_quantile_per_tree,
bool cfg_use_experimental_backend,
int cfg_max_batch_size) {
if (cfg_use_experimental_backend) {
if (cfg_split_algo != SPLIT_ALGO::GLOBAL_QUANTILE) {
CUML_LOG_WARN(
"Experimental backend does not yet support histogram split algorithm");
CUML_LOG_WARN(
"To use experimental backend set split_algo = 1 (GLOBAL_QUANTILE)");
cfg_use_experimental_backend = false;
}
if (cfg_quantile_per_tree) {
CUML_LOG_WARN(
"Experimental backend does not yet support per tree quantile "
"computation");
CUML_LOG_WARN(
"To use experimental backend set quantile_per_tree = false");
cfg_use_experimental_backend = false;
}
if (!cfg_use_experimental_backend) {
CUML_LOG_WARN(
"Not using the experimental backend due to above mentioned reason(s)");
}
}

int cfg_min_samples_leaf, int cfg_min_samples_split,
float cfg_min_impurity_decrease,
CRITERION cfg_split_criterion, int cfg_max_batch_size) {
params.max_depth = cfg_max_depth;
params.max_leaves = cfg_max_leaves;
params.max_features = cfg_max_features;
params.n_bins = cfg_n_bins;
params.split_algo = cfg_split_algo;
params.min_samples_leaf = cfg_min_samples_leaf;
params.min_samples_split = cfg_min_samples_split;
params.bootstrap_features = cfg_bootstrap_features;
params.split_criterion = cfg_split_criterion;
params.quantile_per_tree = cfg_quantile_per_tree;
params.use_experimental_backend = cfg_use_experimental_backend;
params.min_impurity_decrease = cfg_min_impurity_decrease;
params.max_batch_size = cfg_max_batch_size;
}
@@ -95,10 +63,6 @@ void validity_check(const DecisionTreeParams params) {
params.max_features);
ASSERT((params.n_bins > 0), "Invalid n_bins %d", params.n_bins);
ASSERT((params.split_criterion != 3), "MAE not supported.");
ASSERT((params.split_algo >= 0) &&
(params.split_algo < SPLIT_ALGO::SPLIT_ALGO_END),
"split_algo value %d outside permitted [0, %d) range",
params.split_algo, SPLIT_ALGO::SPLIT_ALGO_END);
ASSERT((params.min_samples_leaf >= 1),
"Invalid value for min_samples_leaf %d. Should be >= 1.",
params.min_samples_leaf);
@@ -112,15 +76,10 @@ void print(const DecisionTreeParams params) {
CUML_LOG_DEBUG("max_leaves: %d", params.max_leaves);
CUML_LOG_DEBUG("max_features: %f", params.max_features);
CUML_LOG_DEBUG("n_bins: %d", params.n_bins);
CUML_LOG_DEBUG("split_algo: %d", params.split_algo);
CUML_LOG_DEBUG("min_samples_leaf: %d", params.min_samples_leaf);
CUML_LOG_DEBUG("min_samples_split: %d", params.min_samples_split);
CUML_LOG_DEBUG("bootstrap_features: %d", params.bootstrap_features);
CUML_LOG_DEBUG("split_criterion: %d", params.split_criterion);
CUML_LOG_DEBUG("quantile_per_tree: %d", params.quantile_per_tree);
CUML_LOG_DEBUG("min_impurity_decrease: %f", params.min_impurity_decrease);
CUML_LOG_DEBUG("use_experimental_backend: %s",
params.use_experimental_backend ? "True" : "False");
CUML_LOG_DEBUG("max_batch_size: %d", params.max_batch_size);
}

@@ -159,21 +118,15 @@ void decisionTreeClassifierFit(const raft::handle_t &handle,
uint64_t seed) {
std::shared_ptr<DecisionTreeClassifier<float>> dt_classifier =
std::make_shared<DecisionTreeClassifier<float>>();
std::unique_ptr<MLCommon::device_buffer<float>> global_quantiles_buffer =
nullptr;
float *global_quantiles = nullptr;

if (tree_params.use_experimental_backend) {
auto quantile_size = tree_params.n_bins * ncols;
global_quantiles_buffer = std::make_unique<MLCommon::device_buffer<float>>(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
global_quantiles = global_quantiles_buffer->data();
DecisionTree::computeQuantiles(global_quantiles, tree_params.n_bins, data,
nrows, ncols, handle.get_device_allocator(),
handle.get_stream());
}
auto quantile_size = tree_params.n_bins * ncols;
MLCommon::device_buffer<float> global_quantiles_buffer(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
DecisionTree::computeQuantiles(
global_quantiles_buffer.data(), tree_params.n_bins, data, nrows, ncols,
handle.get_device_allocator(), handle.get_stream());
dt_classifier->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
unique_labels, tree, tree_params, seed, global_quantiles);
unique_labels, tree, tree_params, seed,
global_quantiles_buffer.data());
}

void decisionTreeClassifierFit(const raft::handle_t &handle,
@@ -185,21 +138,16 @@ void decisionTreeClassifierFit(const raft::handle_t &handle,
uint64_t seed) {
std::shared_ptr<DecisionTreeClassifier<double>> dt_classifier =
std::make_shared<DecisionTreeClassifier<double>>();
std::unique_ptr<MLCommon::device_buffer<double>> global_quantiles_buffer =
nullptr;
double *global_quantiles = nullptr;

if (tree_params.use_experimental_backend) {
auto quantile_size = tree_params.n_bins * ncols;
global_quantiles_buffer = std::make_unique<MLCommon::device_buffer<double>>(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
global_quantiles = global_quantiles_buffer->data();
DecisionTree::computeQuantiles(global_quantiles, tree_params.n_bins, data,
nrows, ncols, handle.get_device_allocator(),
handle.get_stream());
}
auto quantile_size = tree_params.n_bins * ncols;
MLCommon::device_buffer<double> global_quantiles_buffer(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
DecisionTree::computeQuantiles(
global_quantiles_buffer.data(), tree_params.n_bins, data, nrows, ncols,
handle.get_device_allocator(), handle.get_stream());
dt_classifier->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
unique_labels, tree, tree_params, seed, global_quantiles);
unique_labels, tree, tree_params, seed,
global_quantiles_buffer.data());
}

void decisionTreeClassifierPredict(const raft::handle_t &handle,
@@ -234,21 +182,14 @@ void decisionTreeRegressorFit(const raft::handle_t &handle,
uint64_t seed) {
std::shared_ptr<DecisionTreeRegressor<float>> dt_regressor =
std::make_shared<DecisionTreeRegressor<float>>();
std::unique_ptr<MLCommon::device_buffer<float>> global_quantiles_buffer =
nullptr;
float *global_quantiles = nullptr;

if (tree_params.use_experimental_backend) {
auto quantile_size = tree_params.n_bins * ncols;
global_quantiles_buffer = std::make_unique<MLCommon::device_buffer<float>>(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
global_quantiles = global_quantiles_buffer->data();
DecisionTree::computeQuantiles(global_quantiles, tree_params.n_bins, data,
nrows, ncols, handle.get_device_allocator(),
handle.get_stream());
}
auto quantile_size = tree_params.n_bins * ncols;
MLCommon::device_buffer<float> global_quantiles(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
DecisionTree::computeQuantiles(
global_quantiles.data(), tree_params.n_bins, data, nrows, ncols,
handle.get_device_allocator(), handle.get_stream());
dt_regressor->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
tree, tree_params, seed, global_quantiles);
tree, tree_params, seed, global_quantiles.data());
}

void decisionTreeRegressorFit(const raft::handle_t &handle,
@@ -259,21 +200,15 @@ void decisionTreeRegressorFit(const raft::handle_t &handle,
uint64_t seed) {
std::shared_ptr<DecisionTreeRegressor<double>> dt_regressor =
std::make_shared<DecisionTreeRegressor<double>>();
std::unique_ptr<MLCommon::device_buffer<double>> global_quantiles_buffer =
nullptr;
double *global_quantiles = nullptr;

if (tree_params.use_experimental_backend) {
auto quantile_size = tree_params.n_bins * ncols;
global_quantiles_buffer = std::make_unique<MLCommon::device_buffer<double>>(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
global_quantiles = global_quantiles_buffer->data();
DecisionTree::computeQuantiles(global_quantiles, tree_params.n_bins, data,
nrows, ncols, handle.get_device_allocator(),
handle.get_stream());
}
auto quantile_size = tree_params.n_bins * ncols;
MLCommon::device_buffer<double> global_quantiles(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
DecisionTree::computeQuantiles(
global_quantiles.data(), tree_params.n_bins, data, nrows, ncols,
handle.get_device_allocator(), handle.get_stream());
dt_regressor->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
tree, tree_params, seed, global_quantiles);
tree, tree_params, seed, global_quantiles.data());
}

void decisionTreeRegressorPredict(const raft::handle_t &handle,
(Diffs for the remaining changed files are not shown here.)
