Remove old RF backend (#3868)

Removes the old random forest backend along with the split_algo and use_experimental_backend parameters. The default n_bins for random forest classification and regression is now 128.
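For C++ callers, the change amounts to dropping four arguments (split_algo, bootstrap_features, quantile_per_tree, use_experimental_backend) from set_rf_params; the new declaration appears in the randomforest.hpp hunk below. A minimal before/after sketch, assuming set_rf_params and RF_params live in the ML namespace as in that header; the concrete values are illustrative only, not defaults introduced by this commit:

#include <cuml/ensemble/randomforest.hpp>

// Before this commit (arguments for the removed backend still present):
//   ML::RF_params p = ML::set_rf_params(max_depth, max_leaves, max_features, n_bins,
//                                       split_algo, min_samples_leaf, min_samples_split,
//                                       min_impurity_decrease, bootstrap_features,
//                                       bootstrap, n_trees, max_samples, seed,
//                                       split_criterion, quantile_per_tree, n_streams,
//                                       use_experimental_backend, max_batch_size);
// After this commit: split_algo, bootstrap_features, quantile_per_tree and
// use_experimental_backend are gone from the signature.
ML::RF_params make_example_params() {
  return ML::set_rf_params(/*max_depth=*/16, /*max_leaves=*/-1,
                           /*max_features=*/1.0f, /*n_bins=*/128,
                           /*min_samples_leaf=*/1, /*min_samples_split=*/2,
                           /*min_impurity_decrease=*/0.0f, /*bootstrap=*/true,
                           /*n_trees=*/100, /*max_samples=*/1.0f,
                           /*seed=*/1234ULL, ML::CRITERION::GINI,
                           /*cfg_n_streams=*/4, /*max_batch_size=*/128);
}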

Authors:
  - Rory Mitchell (https://github.com/RAMitchell)
  - Venkat (https://github.com/venkywonka)

Approvers:
  - Thejaswi. N. S (https://github.com/teju85)
  - Vinay Deshpande (https://github.com/vinaydes)
  - Philip Hyunsu Cho (https://github.com/hcho3)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: #3868
RAMitchell authored Jun 17, 2021
1 parent edecd3b commit 1fcad06
Showing 42 changed files with 837 additions and 5,817 deletions.
4 changes: 0 additions & 4 deletions cpp/bench/sg/fil.cu
@@ -150,19 +150,15 @@ std::vector<Params> getInputs() {
(1 << 20), /* max_leaves */
1.f, /* max_features */
32, /* n_bins */
1, /* split_algo */
3, /* min_samples_leaf */
3, /* min_samples_split */
0.0f, /* min_impurity_decrease */
true, /* bootstrap_features */
true, /* bootstrap */
1, /* n_trees */
1.f, /* max_samples */
1234ULL, /* seed */
ML::CRITERION::MSE, /* split_criterion */
false, /* quantile_per_tree */
8, /* n_streams */
false, /* use_experimental_backend */
128 /* max_batch_size */
);

4 changes: 0 additions & 4 deletions cpp/bench/sg/rf_classifier.cu
@@ -86,19 +86,15 @@ std::vector<Params> getInputs() {
(1 << 20), /* max_leaves */
0.3, /* max_features */
32, /* n_bins */
1, /* split_algo */
3, /* min_samples_leaf */
3, /* min_samples_split */
0.0f, /* min_impurity_decrease */
true, /* bootstrap_features */
true, /* bootstrap */
500, /* n_trees */
1.f, /* max_samples */
1234ULL, /* seed */
ML::CRITERION::GINI, /* split_criterion */
false, /* quantile_per_tree */
8, /* n_streams */
false, /* use_experimental_backend */
128 /* max_batch_size */
);

4 changes: 0 additions & 4 deletions cpp/bench/sg/rf_regressor.cu
@@ -88,19 +88,15 @@ std::vector<RegParams> getInputs() {
(1 << 20), /* max_leaves */
0.3, /* max_features */
32, /* n_bins */
1, /* split_algo */
3, /* min_samples_leaf */
3, /* min_samples_split */
0.0f, /* min_impurity_decrease */
true, /* bootstrap_features */
true, /* bootstrap */
500, /* n_trees */
1.f, /* max_samples */
1234ULL, /* seed */
ML::CRITERION::MSE, /* split_criterion */
false, /* quantile_per_tree */
8, /* n_streams */
false, /* use_experimental_backend */
128 /* max_batch_size */
);
std::vector<DimInfo> dim_info = {{500000, 500, 400}};
10 changes: 4 additions & 6 deletions cpp/include/cuml/ensemble/randomforest.hpp
@@ -181,12 +181,10 @@ RF_metrics score(const raft::handle_t& user_handle,
int verbosity = CUML_LEVEL_INFO);

RF_params set_rf_params(int max_depth, int max_leaves, float max_features,
int n_bins, int split_algo, int min_samples_leaf,
int min_samples_split, float min_impurity_decrease,
bool bootstrap_features, bool bootstrap, int n_trees,
float max_samples, uint64_t seed,
CRITERION split_criterion, bool quantile_per_tree,
int cfg_n_streams, bool use_experimental_backend,
int n_bins, int min_samples_leaf, int min_samples_split,
float min_impurity_decrease, bool bootstrap,
int n_trees, float max_samples, uint64_t seed,
CRITERION split_criterion, int cfg_n_streams,
int max_batch_size);

// ----------------------------- Regression ----------------------------------- //
8 changes: 1 addition & 7 deletions cpp/include/cuml/tree/algo_helper.h
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,12 +17,6 @@
#pragma once

namespace ML {
enum SPLIT_ALGO {
HIST,
GLOBAL_QUANTILE,
SPLIT_ALGO_END,
};

enum CRITERION {
GINI,
ENTROPY,
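Because the SPLIT_ALGO enum is deleted outright, any downstream code that still names it stops compiling; CRITERION is the only selector that survives in this header. A hypothetical caller-side snippet, not part of this commit, showing what remains available after the cleanup:

#include <cuml/tree/algo_helper.h>

// References such as ML::SPLIT_ALGO::HIST or ML::SPLIT_ALGO::GLOBAL_QUANTILE
// must simply be deleted; the remaining backend always splits on precomputed
// global quantiles, so there is nothing left to select.
ML::CRITERION pick_criterion(bool classification) {
  return classification ? ML::CRITERION::GINI : ML::CRITERION::MSE;
}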
35 changes: 2 additions & 33 deletions cpp/include/cuml/tree/decisiontree.hpp
@@ -44,10 +44,6 @@ struct DecisionTreeParams {
* Number of bins used by the split algorithm.
*/
int n_bins;
/**
* The split algorithm: HIST or GLOBAL_QUANTILE.
*/
int split_algo;
/**
* The minimum number of samples (rows) in each leaf node.
*/
@@ -57,16 +53,7 @@
*/
int min_samples_split;
/**
* Control bootstrapping for features. If features are drawn with or without replacement
*/
bool bootstrap_features;
/**
* Whether a quantile needs to be computed for individual trees in RF.
* Default: compute quantiles once per RF. Only affects GLOBAL_QUANTILE split_algo.
*/
bool quantile_per_tree;
/**
* Node split criterion. GINI and Entropy for classification, MSE or MAE for regression.
* Node split criterion. GINI and Entropy for classification, MSE for regression.
*/
CRITERION split_criterion;
/**
@@ -79,14 +66,6 @@
* used only for batched-level algo
*/
int max_batch_size;
/**
* If set to true and following conditions are also met, experimental decision
* tree training implementation would be used:
* split_algo = 1 (GLOBAL_QUANTILE)
* max_features = 1.0 (Feature sub-sampling disabled)
* quantile_per_tree = false (No per tree quantile computation)
*/
bool use_experimental_backend;
};

/**
@@ -96,33 +75,23 @@
* @param[in] cfg_max_leaves: maximum leaves; default -1
* @param[in] cfg_max_features: maximum number of features; default 1.0f
* @param[in] cfg_n_bins: number of bins; default 8
* @param[in] cfg_split_algo: split algorithm; default SPLIT_ALGO::HIST
* @param[in] cfg_min_samples_leaf: min. rows in each leaf node; default 1
* @param[in] cfg_min_samples_split: min. rows needed to split an internal node;
* default 2
* @param[in] cfg_min_impurity_decrease: split a node only if its reduction in
* impurity is more than this value
* @param[in] cfg_bootstrap_features: bootstrapping for features; default false
* @param[in] cfg_split_criterion: split criterion; default CRITERION_END,
* i.e., GINI for classification or MSE for regression
* @param[in] cfg_quantile_per_tree: compute quantile per tree; default false
* @param[in] cfg_use_experimental_backend: When set to true, experimental batched
* backend is used (provided other conditions are met). Default is
True.
* @param[in] cfg_max_batch_size: Maximum number of nodes that can be processed
in a batch. This is used only for batched-level algo. Default
value 128.
*/
void set_tree_params(DecisionTreeParams &params, int cfg_max_depth = -1,
int cfg_max_leaves = -1, float cfg_max_features = 1.0f,
int cfg_n_bins = 8, int cfg_split_algo = SPLIT_ALGO::HIST,
int cfg_min_samples_leaf = 1,
int cfg_n_bins = 128, int cfg_min_samples_leaf = 1,
int cfg_min_samples_split = 2,
float cfg_min_impurity_decrease = 0.0f,
bool cfg_bootstrap_features = false,
CRITERION cfg_split_criterion = CRITERION_END,
bool cfg_quantile_per_tree = false,
bool cfg_use_experimental_backend = true,
int cfg_max_batch_size = 128);

/**
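For the decision tree layer itself, the most visible effects of the declaration above are the new default n_bins of 128 and the slimmer DecisionTreeParams struct. A minimal sketch of configuring a tree against the new API, assuming the usual ML::DecisionTree namespace from this header; the max_depth value is illustrative:

#include <cuml/tree/decisiontree.hpp>

void configure_tree_example() {
  ML::DecisionTree::DecisionTreeParams params;
  // Only the depth is overridden here; everything else takes the new defaults:
  // n_bins = 128, min_samples_leaf = 1, min_samples_split = 2,
  // split_criterion = CRITERION_END (GINI for classification, MSE for
  // regression), max_batch_size = 128.
  ML::DecisionTree::set_tree_params(params, /*cfg_max_depth=*/16);
  // Fields removed by this commit (split_algo, bootstrap_features,
  // quantile_per_tree, use_experimental_backend) no longer exist on the struct.
}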
135 changes: 35 additions & 100 deletions cpp/src/decisiontree/decisiontree.cu
@@ -14,7 +14,9 @@
* limitations under the License.
*/

#include <cuml/common/device_buffer.hpp>
#include <cuml/tree/decisiontree.hpp>
#include <raft/handle.hpp>

#include <cuml/tree/flatnode.h>
#include "decisiontree_impl.cuh"
@@ -29,59 +31,25 @@ namespace DecisionTree {
* @param[in] cfg_max_leaves: maximum leaves; default -1
* @param[in] cfg_max_features: maximum number of features; default 1.0f
* @param[in] cfg_n_bins: number of bins; default 8
* @param[in] cfg_split_algo: split algorithm; default SPLIT_ALGO::HIST
* @param[in] cfg_min_samples_leaf: min. rows in each leaf node; default 1
* @param[in] cfg_min_samples_split: min. rows needed to split an internal node;
* default 2
* @param[in] cfg_bootstrap_features: bootstrapping for features; default false
* @param[in] cfg_split_criterion: split criterion; default CRITERION_END,
* i.e., GINI for classification or MSE for regression
* @param[in] cfg_quantile_per_tree: compute quantile per tree; default false
* @param[in] cfg_use_experimental_backend: Switch to using experimental
backend; default false
* @param[in] cfg_max_batch_size: batch size for experimental backend
*/
void set_tree_params(DecisionTreeParams &params, int cfg_max_depth,
int cfg_max_leaves, float cfg_max_features, int cfg_n_bins,
int cfg_split_algo, int cfg_min_samples_leaf,
int cfg_min_samples_split, float cfg_min_impurity_decrease,
bool cfg_bootstrap_features, CRITERION cfg_split_criterion,
bool cfg_quantile_per_tree,
bool cfg_use_experimental_backend,
int cfg_max_batch_size) {
if (cfg_use_experimental_backend) {
if (cfg_split_algo != SPLIT_ALGO::GLOBAL_QUANTILE) {
CUML_LOG_WARN(
"Experimental backend does not yet support histogram split algorithm");
CUML_LOG_WARN(
"To use experimental backend set split_algo = 1 (GLOBAL_QUANTILE)");
cfg_use_experimental_backend = false;
}
if (cfg_quantile_per_tree) {
CUML_LOG_WARN(
"Experimental backend does not yet support per tree quantile "
"computation");
CUML_LOG_WARN(
"To use experimental backend set quantile_per_tree = false");
cfg_use_experimental_backend = false;
}
if (!cfg_use_experimental_backend) {
CUML_LOG_WARN(
"Not using the experimental backend due to above mentioned reason(s)");
}
}

int cfg_min_samples_leaf, int cfg_min_samples_split,
float cfg_min_impurity_decrease,
CRITERION cfg_split_criterion, int cfg_max_batch_size) {
params.max_depth = cfg_max_depth;
params.max_leaves = cfg_max_leaves;
params.max_features = cfg_max_features;
params.n_bins = cfg_n_bins;
params.split_algo = cfg_split_algo;
params.min_samples_leaf = cfg_min_samples_leaf;
params.min_samples_split = cfg_min_samples_split;
params.bootstrap_features = cfg_bootstrap_features;
params.split_criterion = cfg_split_criterion;
params.quantile_per_tree = cfg_quantile_per_tree;
params.use_experimental_backend = cfg_use_experimental_backend;
params.min_impurity_decrease = cfg_min_impurity_decrease;
params.max_batch_size = cfg_max_batch_size;
}
@@ -95,10 +63,6 @@ void validity_check(const DecisionTreeParams params) {
params.max_features);
ASSERT((params.n_bins > 0), "Invalid n_bins %d", params.n_bins);
ASSERT((params.split_criterion != 3), "MAE not supported.");
ASSERT((params.split_algo >= 0) &&
(params.split_algo < SPLIT_ALGO::SPLIT_ALGO_END),
"split_algo value %d outside permitted [0, %d) range",
params.split_algo, SPLIT_ALGO::SPLIT_ALGO_END);
ASSERT((params.min_samples_leaf >= 1),
"Invalid value for min_samples_leaf %d. Should be >= 1.",
params.min_samples_leaf);
@@ -112,15 +76,10 @@ void print(const DecisionTreeParams params) {
CUML_LOG_DEBUG("max_leaves: %d", params.max_leaves);
CUML_LOG_DEBUG("max_features: %f", params.max_features);
CUML_LOG_DEBUG("n_bins: %d", params.n_bins);
CUML_LOG_DEBUG("split_algo: %d", params.split_algo);
CUML_LOG_DEBUG("min_samples_leaf: %d", params.min_samples_leaf);
CUML_LOG_DEBUG("min_samples_split: %d", params.min_samples_split);
CUML_LOG_DEBUG("bootstrap_features: %d", params.bootstrap_features);
CUML_LOG_DEBUG("split_criterion: %d", params.split_criterion);
CUML_LOG_DEBUG("quantile_per_tree: %d", params.quantile_per_tree);
CUML_LOG_DEBUG("min_impurity_decrease: %f", params.min_impurity_decrease);
CUML_LOG_DEBUG("use_experimental_backend: %s",
params.use_experimental_backend ? "True" : "False");
CUML_LOG_DEBUG("max_batch_size: %d", params.max_batch_size);
}

@@ -159,21 +118,15 @@ void decisionTreeClassifierFit(const raft::handle_t &handle,
uint64_t seed) {
std::shared_ptr<DecisionTreeClassifier<float>> dt_classifier =
std::make_shared<DecisionTreeClassifier<float>>();
std::unique_ptr<MLCommon::device_buffer<float>> global_quantiles_buffer =
nullptr;
float *global_quantiles = nullptr;

if (tree_params.use_experimental_backend) {
auto quantile_size = tree_params.n_bins * ncols;
global_quantiles_buffer = std::make_unique<MLCommon::device_buffer<float>>(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
global_quantiles = global_quantiles_buffer->data();
DecisionTree::computeQuantiles(global_quantiles, tree_params.n_bins, data,
nrows, ncols, handle.get_device_allocator(),
handle.get_stream());
}
auto quantile_size = tree_params.n_bins * ncols;
MLCommon::device_buffer<float> global_quantiles_buffer(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
DecisionTree::computeQuantiles(
global_quantiles_buffer.data(), tree_params.n_bins, data, nrows, ncols,
handle.get_device_allocator(), handle.get_stream());
dt_classifier->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
unique_labels, tree, tree_params, seed, global_quantiles);
unique_labels, tree, tree_params, seed,
global_quantiles_buffer.data());
}

void decisionTreeClassifierFit(const raft::handle_t &handle,
@@ -185,21 +138,16 @@ void decisionTreeClassifierFit(const raft::handle_t &handle,
uint64_t seed) {
std::shared_ptr<DecisionTreeClassifier<double>> dt_classifier =
std::make_shared<DecisionTreeClassifier<double>>();
std::unique_ptr<MLCommon::device_buffer<double>> global_quantiles_buffer =
nullptr;
double *global_quantiles = nullptr;

if (tree_params.use_experimental_backend) {
auto quantile_size = tree_params.n_bins * ncols;
global_quantiles_buffer = std::make_unique<MLCommon::device_buffer<double>>(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
global_quantiles = global_quantiles_buffer->data();
DecisionTree::computeQuantiles(global_quantiles, tree_params.n_bins, data,
nrows, ncols, handle.get_device_allocator(),
handle.get_stream());
}
auto quantile_size = tree_params.n_bins * ncols;
MLCommon::device_buffer<double> global_quantiles_buffer(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
DecisionTree::computeQuantiles(
global_quantiles_buffer.data(), tree_params.n_bins, data, nrows, ncols,
handle.get_device_allocator(), handle.get_stream());
dt_classifier->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
unique_labels, tree, tree_params, seed, global_quantiles);
unique_labels, tree, tree_params, seed,
global_quantiles_buffer.data());
}

void decisionTreeClassifierPredict(const raft::handle_t &handle,
@@ -234,21 +182,14 @@ void decisionTreeRegressorFit(const raft::handle_t &handle,
uint64_t seed) {
std::shared_ptr<DecisionTreeRegressor<float>> dt_regressor =
std::make_shared<DecisionTreeRegressor<float>>();
std::unique_ptr<MLCommon::device_buffer<float>> global_quantiles_buffer =
nullptr;
float *global_quantiles = nullptr;

if (tree_params.use_experimental_backend) {
auto quantile_size = tree_params.n_bins * ncols;
global_quantiles_buffer = std::make_unique<MLCommon::device_buffer<float>>(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
global_quantiles = global_quantiles_buffer->data();
DecisionTree::computeQuantiles(global_quantiles, tree_params.n_bins, data,
nrows, ncols, handle.get_device_allocator(),
handle.get_stream());
}
auto quantile_size = tree_params.n_bins * ncols;
MLCommon::device_buffer<float> global_quantiles(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
DecisionTree::computeQuantiles(
global_quantiles.data(), tree_params.n_bins, data, nrows, ncols,
handle.get_device_allocator(), handle.get_stream());
dt_regressor->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
tree, tree_params, seed, global_quantiles);
tree, tree_params, seed, global_quantiles.data());
}

void decisionTreeRegressorFit(const raft::handle_t &handle,
@@ -259,21 +200,15 @@ void decisionTreeRegressorFit(const raft::handle_t &handle,
uint64_t seed) {
std::shared_ptr<DecisionTreeRegressor<double>> dt_regressor =
std::make_shared<DecisionTreeRegressor<double>>();
std::unique_ptr<MLCommon::device_buffer<double>> global_quantiles_buffer =
nullptr;
double *global_quantiles = nullptr;

if (tree_params.use_experimental_backend) {
auto quantile_size = tree_params.n_bins * ncols;
global_quantiles_buffer = std::make_unique<MLCommon::device_buffer<double>>(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
global_quantiles = global_quantiles_buffer->data();
DecisionTree::computeQuantiles(global_quantiles, tree_params.n_bins, data,
nrows, ncols, handle.get_device_allocator(),
handle.get_stream());
}
auto quantile_size = tree_params.n_bins * ncols;
MLCommon::device_buffer<double> global_quantiles(
handle.get_device_allocator(), handle.get_stream(), quantile_size);
DecisionTree::computeQuantiles(
global_quantiles.data(), tree_params.n_bins, data, nrows, ncols,
handle.get_device_allocator(), handle.get_stream());
dt_regressor->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
tree, tree_params, seed, global_quantiles);
tree, tree_params, seed, global_quantiles.data());
}

void decisionTreeRegressorPredict(const raft::handle_t &handle,
(Diffs for the remaining changed files are not shown here.)
