Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use the new RF backend by default for classification #3686

Merged
merged 14 commits into from
Apr 7, 2021
Merged
2 changes: 1 addition & 1 deletion cpp/test/sg/rf_accuracy_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ class RFClassifierAccuracyTest : public ::testing::TestWithParam<RFInputs> {
sc, /* split_criterion */
false, /* quantile_per_tree */
1, /* n_streams */
false, /* use_experimental_backend */
true, /* use_experimental_backend */
128 /* max_batch_size */
);
}
Expand Down
6 changes: 3 additions & 3 deletions cpp/test/sg/rf_depth_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class RfClassifierDepthTest : public ::testing::TestWithParam<int> {
protected:
void basicTest() {
const int max_depth = ::testing::TestWithParam<int>::GetParam();
params = RfInputs<T>{5000,
params = RfInputs<T>{10000,
10,
1,
1.0f,
Expand All @@ -73,7 +73,7 @@ class RfClassifierDepthTest : public ::testing::TestWithParam<int> {
params.split_algo, params.min_samples_leaf, params.min_samples_split,
params.min_impurity_decrease, params.bootstrap_features, params.bootstrap,
params.n_trees, params.max_samples, 0, params.split_criterion, false,
params.n_streams, false, 128);
params.n_streams, true, 128);
hcho3 marked this conversation as resolved.
Show resolved Hide resolved

int data_len = params.n_rows * params.n_cols;
raft::allocate(data, data_len);
Expand Down Expand Up @@ -165,7 +165,7 @@ class RfRegressorDepthTest : public ::testing::TestWithParam<int> {
params.split_algo, params.min_samples_leaf, params.min_samples_split,
params.min_impurity_decrease, params.bootstrap_features, params.bootstrap,
params.n_trees, params.max_samples, 0, params.split_criterion, false,
params.n_streams, false, 128);
params.n_streams, true, 128);

int data_len = params.n_rows * params.n_cols;
raft::allocate(data, data_len);
Expand Down
10 changes: 5 additions & 5 deletions cpp/test/sg/rf_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class RfClassifierTest : public ::testing::TestWithParam<RfInputs<T>> {
params.split_algo, params.min_samples_leaf, params.min_samples_split,
params.min_impurity_decrease, params.bootstrap_features, params.bootstrap,
params.n_trees, params.max_samples, 0, params.split_criterion, false,
params.n_streams, false, 128);
params.n_streams, true, 128);
hcho3 marked this conversation as resolved.
Show resolved Hide resolved

//--------------------------------------------------------
// Random Forest
Expand Down Expand Up @@ -258,7 +258,7 @@ const std::vector<RfInputs<float>> inputsf2_clf = {
CRITERION::
GINI}, //forest with 10 trees, with bootstrap and column subsampling enabled, 3 bins
{4, 2, 10, 0.8f, 0.8f, 4, 7, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE,
2, 2, 0.0, 2,
1, 2, 0.0, 1,
CRITERION::
CRITERION_END}, //forest with 10 trees, with bootstrap and column subsampling enabled, 3 bins, different split algorithm
{4, 2, 1, 1.0f, 1.0f, 4, 7, -1, false, false, 4, SPLIT_ALGO::HIST, 2, 2, 0.0,
Expand All @@ -270,7 +270,7 @@ const std::vector<RfInputs<float>> inputsf2_clf = {
{4, 2, 10, 0.8f, 0.8f, 4, 7, -1, true, false, 3, SPLIT_ALGO::HIST, 2, 2, 0.0,
2, CRITERION::ENTROPY},
{4, 2, 10, 0.8f, 0.8f, 4, 7, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE,
2, 2, 0.0, 2, CRITERION::ENTROPY},
1, 2, 0.0, 2, CRITERION::ENTROPY},
{50, 10, 10, 0.8f, 0.8f, 10, 7, -1, true, true, 3,
SPLIT_ALGO::GLOBAL_QUANTILE, 2, 2, 0.0, 2, CRITERION::ENTROPY}};

Expand All @@ -284,7 +284,7 @@ const std::vector<RfInputs<double>> inputsd2_clf = { // Same as inputsf2_clf
{4, 2, 10, 0.8f, 0.8f, 4, 7, -1, true, false, 3, SPLIT_ALGO::HIST, 2, 2, 0.0,
2, CRITERION::GINI},
{4, 2, 10, 0.8f, 0.8f, 4, 7, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE,
2, 2, 0.0, 2, CRITERION::CRITERION_END},
1, 2, 0.0, 2, CRITERION::CRITERION_END},
{4, 2, 1, 1.0f, 1.0f, 4, 7, -1, false, false, 4, SPLIT_ALGO::HIST, 2, 2, 0.0,
2, CRITERION::ENTROPY},
{4, 2, 1, 1.0f, 1.0f, 4, 7, -1, false, false, 4, SPLIT_ALGO::HIST, 2, 2, 0.0,
Expand All @@ -294,7 +294,7 @@ const std::vector<RfInputs<double>> inputsd2_clf = { // Same as inputsf2_clf
{4, 2, 10, 0.8f, 0.8f, 4, 7, -1, true, false, 3, SPLIT_ALGO::HIST, 2, 2, 0.0,
2, CRITERION::ENTROPY},
{4, 2, 10, 0.8f, 0.8f, 4, 7, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE,
2, 2, 0.0, 2, CRITERION::ENTROPY},
1, 2, 0.0, 2, CRITERION::ENTROPY},
{50, 10, 10, 0.8f, 0.8f, 10, 7, -1, true, true, 3,
SPLIT_ALGO::GLOBAL_QUANTILE, 2, 2, 0.0, 2, CRITERION::ENTROPY}};

Expand Down
2 changes: 1 addition & 1 deletion cpp/test/sg/rf_treelite_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ class RfTreeliteTestCommon : public ::testing::TestWithParam<RfInputs<T>> {
params.split_algo, params.min_samples_leaf, params.min_samples_split,
params.min_impurity_decrease, params.bootstrap_features, params.bootstrap,
params.n_trees, params.max_samples, 0, params.split_criterion, false,
params.n_streams, false, 128);
params.n_streams, true, 128);

handle.reset(new raft::handle_t(rf_params.n_streams));

Expand Down
8 changes: 8 additions & 0 deletions python/cuml/dask/ensemble/randomforestclassifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,14 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin,
quantile_per_tree : boolean (default = False)
Whether quantile is computed for individual RF trees.
Only relevant for GLOBAL_QUANTILE split_algo.
use_experimental_backend : boolean (default = True)
If set to true and the following conditions are also met, a new
experimental backend for decision tree training will be used. The
new backend is available only if `split_algo = 1` (GLOBAL_QUANTILE)
and `quantile_per_tree = False` (No per tree quantile computation).
The new backend is considered stable for classification tasks but
not yet for regression tasks. The RAPIDS team is continuing
optimization and evaluation of the new backend for regression tasks.
n_streams : int (default = 4 )
Number of parallel streams used for forest building
workers : optional, list of strings
Expand Down
11 changes: 11 additions & 0 deletions python/cuml/dask/ensemble/randomforestregressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,17 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin,
for median of abs error : 'median_ae'
for mean of abs error : 'mean_ae'
for mean square error' : 'mse'
quantile_per_tree : boolean (default = False)
Whether quantile is computed for individual RF trees.
Only relevant for GLOBAL_QUANTILE split_algo.
use_experimental_backend : boolean (default = False)
If set to true and the following conditions are also met, a new
experimental backend for decision tree training will be used. The
new backend is available only if `split_algo = 1` (GLOBAL_QUANTILE)
and `quantile_per_tree = False` (No per tree quantile computation).
The new backend is considered stable for classification tasks but
not yet for regression tasks. The RAPIDS team is continuing
optimization and evaluation of the new backend for regression tasks.
n_streams : int (default = 4 )
Number of parallel streams used for forest building
workers : optional, list of strings
Expand Down
22 changes: 15 additions & 7 deletions python/cuml/ensemble/randomforestclassifier.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,10 @@ class RandomForestClassifier(BaseRandomForestModel,
If 'auto' then max_features=1/sqrt(n_features).
If 'sqrt' then max_features=1/sqrt(n_features).
If 'log2' then max_features=log2(n_features)/n_features.
n_bins : int (default = 8)
n_bins : int (default = 32)
hcho3 marked this conversation as resolved.
Show resolved Hide resolved
Number of bins used by the split algorithm.
For large problems, particularly those with highly-skewed input data,
increasing the number of bins may improve accuracy.
min_samples_leaf : int or float (default = 1)
The minimum number of samples (rows) in each leaf node.
If int, then min_samples_leaf represents the minimum number.
Expand All @@ -232,11 +234,14 @@ class RandomForestClassifier(BaseRandomForestModel,
.. deprecated:: 0.19
Parameter 'quantile_per_tree' is deprecated and will be removed in
subsequent release.
use_experimental_backend : boolean (default = False)
If set to true and the following conditions are also met, experimental
decision tree training implementation would be used only if
`split_algo = 1` (GLOBAL_QUANTILE) and `quantile_per_tree = False`
(No per tree quantile computation).
use_experimental_backend : boolean (default = True)
hcho3 marked this conversation as resolved.
Show resolved Hide resolved
If set to true and the following conditions are also met, a new
experimental backend for decision tree training will be used. The
new backend is available only if `split_algo = 1` (GLOBAL_QUANTILE)
and `quantile_per_tree = False` (No per tree quantile computation).
The new backend is considered stable for the classification task but
not for the regression task. We are currently evaluating the impact
of the new backend on the regression task.
max_batch_size: int (default = 128)
Maximum number of nodes that can be processed in a given batch. This is
used only when 'use_experimental_backend' is true. Does not currently
Expand Down Expand Up @@ -265,7 +270,8 @@ class RandomForestClassifier(BaseRandomForestModel,
"""

def __init__(self, *, split_criterion=0, handle=None, verbose=False,
output_type=None, **kwargs):
output_type=None, n_bins=32, use_experimental_backend=True,
**kwargs):

self.RF_type = CLASSIFICATION
self.num_classes = 2
Expand All @@ -274,6 +280,8 @@ class RandomForestClassifier(BaseRandomForestModel,
handle=handle,
verbose=verbose,
output_type=output_type,
n_bins=n_bins,
use_experimental_backend=use_experimental_backend,
**kwargs)

"""
Expand Down
13 changes: 9 additions & 4 deletions python/cuml/ensemble/randomforestregressor.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,8 @@ class RandomForestRegressor(BaseRandomForestModel,
If 'log2' then max_features=log2(n_features)/n_features.
n_bins : int (default = 8)
Number of bins used by the split algorithm.
For large problems, particularly those with highly-skewed input data,
increasing the number of bins may improve accuracy.
min_samples_leaf : int or float (default = 1)
The minimum number of samples (rows) in each leaf node.
If int, then min_samples_leaf represents the minimum number.
Expand Down Expand Up @@ -224,10 +226,13 @@ class RandomForestRegressor(BaseRandomForestModel,
Parameter 'quantile_per_tree' is deprecated and will be removed in
subsequent release.
use_experimental_backend : boolean (default = False)
If set to true and following conditions are also met, experimental
decision tree training implementation would be used only if
`split_algo = 1` (GLOBAL_QUANTILE) and `quantile_per_tree = False`
(No per tree quantile computation).
If set to true and the following conditions are also met, a new
experimental backend for decision tree training will be used. The
new backend is available only if `split_algo = 1` (GLOBAL_QUANTILE)
and `quantile_per_tree = False` (No per tree quantile computation).
The new backend is considered stable for the classification task but
not for the regression task. We are currently evaluating the impact
of the new backend for the regression task.
max_batch_size: int (default = 128)
Maximum number of nodes that can be processed in a given batch. This is
used only when 'use_experimental_backend' is true.
Expand Down
2 changes: 1 addition & 1 deletion python/cuml/test/dask/test_random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ def test_rf_instance_count(client, max_depth, n_estimators):
n_bins=16, split_algo=1, split_criterion=0,
min_samples_leaf=2, seed=23707, n_streams=1,
n_estimators=n_estimators, max_leaves=-1,
max_depth=max_depth, use_experimental_backend=True)
max_depth=max_depth)
y = y.astype(np.int32)

X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2)
Expand Down
78 changes: 31 additions & 47 deletions python/cuml/test/test_random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,8 @@ def special_reg(request):
@pytest.mark.parametrize('datatype', [np.float32])
@pytest.mark.parametrize('split_algo', [0, 1])
@pytest.mark.parametrize('max_features', [1.0, 'auto', 'log2', 'sqrt'])
@pytest.mark.parametrize('use_experimental_backend', [True, False])
def test_rf_classification(small_clf, datatype, split_algo,
max_samples, max_features,
use_experimental_backend):
max_samples, max_features):
use_handle = True

X, y = small_clf
Expand All @@ -165,26 +163,24 @@ def test_rf_classification(small_clf, datatype, split_algo,
n_bins=16, split_algo=split_algo, split_criterion=0,
min_samples_leaf=2, random_state=123, n_streams=1,
n_estimators=40, handle=handle, max_leaves=-1,
max_depth=16,
use_experimental_backend=use_experimental_backend)
max_depth=16)
f = io.StringIO()
with redirect_stdout(f):
cuml_model.fit(X_train, y_train)
captured_stdout = f.getvalue()
if use_experimental_backend:
is_fallback_used = False
if split_algo != 1:
assert ('Experimental backend does not yet support histogram ' +
'split algorithm' in captured_stdout)
is_fallback_used = True
if is_fallback_used:
assert ('Not using the experimental backend due to above ' +
'mentioned reason(s)' in captured_stdout)
else:
assert ('Using experimental backend for growing trees'
in captured_stdout)

is_fallback_used = False
if split_algo != 1:
assert ('Experimental backend does not yet support histogram ' +
'split algorithm' in captured_stdout)
is_fallback_used = True
if is_fallback_used:
assert ('Not using the experimental backend due to above ' +
'mentioned reason(s)' in captured_stdout)
else:
assert captured_stdout == ''
assert ('Using experimental backend for growing trees'
in captured_stdout)

fil_preds = cuml_model.predict(X_test,
predict_model="GPU",
threshold=0.5,
Expand Down Expand Up @@ -856,8 +852,7 @@ def predict_with_json_rf_regressor(rf, x):

@pytest.mark.parametrize('max_depth', [1, 2, 3, 5, 10, 15, 20])
@pytest.mark.parametrize('n_estimators', [5, 10, 20])
@pytest.mark.parametrize('use_experimental_backend', [True, False])
def test_rf_instance_count(max_depth, n_estimators, use_experimental_backend):
def test_rf_instance_count(max_depth, n_estimators):
X, y = make_classification(n_samples=350, n_features=20,
n_clusters_per_class=1, n_informative=10,
random_state=123, n_classes=2)
Expand All @@ -866,8 +861,7 @@ def test_rf_instance_count(max_depth, n_estimators, use_experimental_backend):
n_bins=16, split_algo=1, split_criterion=0,
min_samples_leaf=2, random_state=23707, n_streams=1,
n_estimators=n_estimators, max_leaves=-1,
max_depth=max_depth,
use_experimental_backend=use_experimental_backend)
max_depth=max_depth)
y = y.astype(np.int32)

# Train model on the data
Expand All @@ -879,31 +873,21 @@ def test_rf_instance_count(max_depth, n_estimators, use_experimental_backend):
# The instance count of each node must be equal to the sum of
# the instance counts of its children. Note that the instance count
# is only available with the new backend.
if use_experimental_backend:
def check_instance_count_for_non_leaf(tree):
assert 'instance_count' in tree
if 'children' not in tree:
return
assert 'instance_count' in tree['children'][0]
assert 'instance_count' in tree['children'][1]
assert (tree['instance_count']
== tree['children'][0]['instance_count']
+ tree['children'][1]['instance_count'])
check_instance_count_for_non_leaf(tree['children'][0])
check_instance_count_for_non_leaf(tree['children'][1])
for tree in json_obj:
check_instance_count_for_non_leaf(tree)
# The root's count must be equal to the number of rows in the data
assert tree['instance_count'] == X.shape[0]
else:
def assert_instance_count_absent(tree):
assert 'instance_count' not in tree
if 'children' not in tree:
return
assert_instance_count_absent(tree['children'][0])
assert_instance_count_absent(tree['children'][1])
for tree in json_obj:
assert_instance_count_absent(tree)
def check_instance_count_for_non_leaf(tree):
assert 'instance_count' in tree
if 'children' not in tree:
return
assert 'instance_count' in tree['children'][0]
assert 'instance_count' in tree['children'][1]
assert (tree['instance_count']
== tree['children'][0]['instance_count']
+ tree['children'][1]['instance_count'])
check_instance_count_for_non_leaf(tree['children'][0])
check_instance_count_for_non_leaf(tree['children'][1])
for tree in json_obj:
check_instance_count_for_non_leaf(tree)
# The root's count must be equal to the number of rows in the data
assert tree['instance_count'] == X.shape[0]


@pytest.mark.memleak
Expand Down