rapidsai · rapids-bot · Mar 30, 2021 · Mar 29, 2021 · Mar 29, 2021 · Mar 29, 2021
diff --git a/docs/source/estimator_intro.ipynb b/docs/source/estimator_intro.ipynb
@@ -94,7 +94,7 @@
     "\n",
     "model = cuRF( max_depth = max_depth, \n",
     "              n_estimators = n_estimators,\n",
-    "              seed  = 0 )\n",
+    "              random_state  = 0 )\n",
     "\n",
     "trained_RF = model.fit ( X_train, y_train )\n",
     "\n",

diff --git a/notebooks/random_forest_demo.ipynb b/notebooks/random_forest_demo.ipynb
@@ -176,7 +176,7 @@
     "cuml_model = curfc(n_estimators=40,\n",
     "                   max_depth=16,\n",
     "                   max_features=1.0,\n",
-    "                   seed=10)\n",
+    "                   random_state=10)\n",
     "\n",
     "cuml_model.fit(X_cudf_train, y_cudf_train)"
    ]

@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -55,22 +55,17 @@ class BaseRandomForestModel(Base):
 
     classes_ = CumlArrayDescriptor()
 
-    def __init__(self, *, split_criterion, seed=None,
-                 n_streams=8, n_estimators=100,
-                 max_depth=16, handle=None, max_features='auto',
-                 n_bins=8, split_algo=1, bootstrap=True,
-                 bootstrap_features=False,
-                 verbose=False, min_rows_per_node=None,
-                 min_samples_leaf=1, min_samples_split=2,
-                 rows_sample=None, max_samples=1.0, max_leaves=-1,
-                 accuracy_metric=None, dtype=None,
-                 output_type=None,
-                 min_weight_fraction_leaf=None, n_jobs=None,
-                 max_leaf_nodes=None, min_impurity_decrease=0.0,
-                 min_impurity_split=None, oob_score=None,
-                 random_state=None, warm_start=None, class_weight=None,
-                 quantile_per_tree=False, criterion=None,
-                 use_experimental_backend=False, max_batch_size=128):
+    def __init__(self, *, split_criterion, n_streams=8, n_estimators=100,
+                 max_depth=16, handle=None, max_features='auto', n_bins=8,
+                 split_algo=1, bootstrap=True, bootstrap_features=False,
+                 verbose=False, min_samples_leaf=1, min_samples_split=2,
+                 max_samples=1.0, max_leaves=-1, accuracy_metric=None,
+                 dtype=None, output_type=None, min_weight_fraction_leaf=None,
+                 n_jobs=None, max_leaf_nodes=None, min_impurity_decrease=0.0,
+                 min_impurity_split=None, oob_score=None, random_state=None,
+                 warm_start=None, class_weight=None, quantile_per_tree=False,
+                 criterion=None, use_experimental_backend=False,
+                 max_batch_size=128):
 
         sklearn_params = {"criterion": criterion,
                           "min_weight_fraction_leaf": min_weight_fraction_leaf,
@@ -89,37 +84,17 @@ class BaseRandomForestModel(Base):
                     "(https://docs.rapids.ai/api/cuml/nightly/"
                     "api.html#random-forest) for more information")
 
-        if seed is not None:
-            if random_state is None:
-                warnings.warn("Parameter 'seed' is deprecated and will be"
-                              " removed in 0.17. Please use 'random_state'"
-                              " instead. Setting 'random_state' as the"
-                              " curent 'seed' value",
-                              DeprecationWarning)
-                random_state = seed
-            else:
-                warnings.warn("Both 'seed' and 'random_state' parameters were"
-                              " set. Using 'random_state' since 'seed' is"
-                              " deprecated and will be removed in 0.17.",
-                              DeprecationWarning)
-
         if ((random_state is not None) and (n_streams != 1)):
             warnings.warn("For reproducible results in Random Forest"
                           " Classifier or for almost reproducible results"
                           " in Random Forest Regressor, n_streams==1 is "
                           "recommended. If n_streams is > 1, results may vary "
                           "due to stream/thread timing differences, even when "
                           "random_state is set")
-        if min_rows_per_node is not None:
-            warnings.warn("The 'min_rows_per_node' parameter is deprecated "
-                          "and will be removed in 0.18. Please use "
-                          "'min_samples_leaf' parameter instead.")
-            min_samples_leaf = min_rows_per_node
-        if rows_sample is not None:
-            warnings.warn("The 'rows_sample' parameter is deprecated and will "
-                          "be removed in 0.18. Please use 'max_samples' "
-                          "parameter instead.")
-            max_samples = rows_sample
+        if quantile_per_tree:
+            warnings.warn("The 'quantile_per_tree' parameter is deprecated "
+                          "and will be removed in 0.20 release. Instead use "
+                          "higher number of global quantile bins.")
         if handle is None:
             handle = Handle(n_streams)
 

@@ -129,40 +129,40 @@ class RandomForestClassifier(BaseRandomForestModel,
     Implements a Random Forest classifier model which fits multiple decision
     tree classifiers in an ensemble.
 
-    Note that the underlying algorithm for tree node splits differs from that
-    used in scikit-learn. By default, the cuML Random Forest uses a
-    histogram-based algorithms to determine splits, rather than an exact
-    count. You can tune the size of the histograms with the n_bins parameter.
-
-    .. note:: This is an early release of the cuML
-        Random Forest code. It contains a few known limitations:
-
-       * GPU-based inference is only supported if the model was trained
-         with 32-bit (float32) datatypes. CPU-based inference may be used
-         in this case as a slower fallback.
-       * Very deep / very wide models may exhaust available GPU memory.
-         Future versions of cuML will provide an alternative algorithm to
-         reduce memory consumption.
-       * While training the model for multi class classification problems,
-         using deep trees or `max_features=1.0` provides better performance.
+    .. note:: The underlying algorithm for tree node splits differs
+      from that used in scikit-learn. By default, the cuML Random Forest uses a
+      histogram-based algorithm to determine splits, rather than an exact
+      count. You can tune the size of the histograms with the n_bins parameter.
+
+    **Known Limitations**: This is an early release of the cuML
+    Random Forest code. It contains a few known limitations:
+
+      * GPU-based inference is only supported if the model was trained
+        with 32-bit (float32) datatypes. CPU-based inference may be used
+        in this case as a slower fallback.
+      * Very deep / very wide models may exhaust available GPU memory.
+        Future versions of cuML will provide an alternative algorithm to
+        reduce memory consumption.
+      * While training the model for multi class classification problems,
+        using deep trees or `max_features=1.0` provides better performance.
 
     Examples
     --------
     .. code-block:: python
 
-            import numpy as np
-            from cuml.ensemble import RandomForestClassifier as cuRFC
+        import numpy as np
+        from cuml.ensemble import RandomForestClassifier as cuRFC
 
-            X = np.random.normal(size=(10,4)).astype(np.float32)
-            y = np.asarray([0,1]*5, dtype=np.int32)
+        X = np.random.normal(size=(10,4)).astype(np.float32)
+        y = np.asarray([0,1]*5, dtype=np.int32)
 
-            cuml_model = cuRFC(max_features=1.0,
-                               n_bins=8,
-                               n_estimators=40)
-            cuml_model.fit(X,y)
-            cuml_predict = cuml_model.predict(X)
+        cuml_model = cuRFC(max_features=1.0,
+                           n_bins=8,
+                           n_estimators=40)
+        cuml_model.fit(X,y)
+        cuml_predict = cuml_model.predict(X)
 
-            print("Predicted labels : ", cuml_predict)
+        print("Predicted labels : ", cuml_predict)
 
     Output:
 
@@ -180,7 +180,7 @@ class RandomForestClassifier(BaseRandomForestModel,
         (default = 0)
     split_algo : int (default = 1)
         The algorithm to determine how nodes are split in the tree.
-        0 for HIST and 1 for GLOBAL_QUANTILE. HIST curently uses a slower
+        0 for HIST and 1 for GLOBAL_QUANTILE. HIST currently uses a slower
         tree-building algorithm so GLOBAL_QUANTILE is recommended for most
         cases.
     bootstrap : boolean (default = True)
@@ -226,24 +226,25 @@ class RandomForestClassifier(BaseRandomForestModel,
         Minimum decrease in impurity requried for
         node to be spilt.
     quantile_per_tree : boolean (default = False)
-        Whether quantile is computed for individal trees in RF.
-        Only relevant for GLOBAL_QUANTILE split_algo.
+        Whether quantile is computed for individual trees in RF.
+        Only relevant when `split_algo = GLOBAL_QUANTILE`.
+
+        .. deprecated:: 0.19
+           Parameter 'quantile_per_tree' is deprecated and will be removed in
+           release 0.20. Use a higher number of global quantile bins instead.
     use_experimental_backend : boolean (default = False)
         If set to true and  following conditions are also met, experimental
-         decision tree training implementation would be used:
-            split_algo = 1 (GLOBAL_QUANTILE)
-            quantile_per_tree = false (No per tree quantile computation)
+        decision tree training implementation would be used:
+        `split_algo = 1` (GLOBAL_QUANTILE) and `quantile_per_tree = False`
+        (no per-tree quantile computation).
     max_batch_size: int (default = 128)
         Maximum number of nodes that can be processed in a given batch. This is
-        used only when 'use_experimental_backend' is true.
+        used only when 'use_experimental_backend' is true. Does not currently
+        fully guarantee the exact same results.
     random_state : int (default = None)
-        Seed for the random number generator. Unseeded by default.
-    seed : int (default = None)
-        Seed for the random number generator. Unseeded by default.
-
-        .. deprecated:: 0.16
-           Parameter `seed` is deprecated and will be removed in 0.17. Please
-           use `random_state` instead
+        Seed for the random number generator. Unseeded by default. Does not
+        currently fully guarantee the exact same results. **Note: Parameter
+        `seed` was removed in release 0.19; use `random_state` instead.**
 
     handle : cuml.Handle
         Specifies the cuml.handle that holds internal CUDA state for

@@ -112,21 +112,20 @@ class RandomForestRegressor(BaseRandomForestModel,
     Implements a Random Forest regressor model which fits multiple decision
     trees in an ensemble.
 
-    .. note:: that the underlying algorithm for tree node splits differs from
-        that used in scikit-learn. By default, the cuML Random Forest uses a
-        histogram-based algorithm to determine splits, rather than an exact
-        count. You can tune the size of the histograms with the n_bins
-        parameter.
+    .. note:: The underlying algorithm for tree node splits differs
+      from that used in scikit-learn. By default, the cuML Random Forest uses a
+      histogram-based algorithm to determine splits, rather than an exact
+      count. You can tune the size of the histograms with the n_bins parameter.
 
     **Known Limitations**: This is an early release of the cuML
     Random Forest code. It contains a few known limitations:
 
-     * GPU-based inference is only supported if the model was trained
-       with 32-bit (float32) datatypes. CPU-based inference may be used
-       in this case as a slower fallback.
-     * Very deep / very wide models may exhaust available GPU memory.
-       Future versions of cuML will provide an alternative algorithm to
-       reduce memory consumption.
+      * GPU-based inference is only supported if the model was trained
+        with 32-bit (float32) datatypes. CPU-based inference may be used
+        in this case as a slower fallback.
+      * Very deep / very wide models may exhaust available GPU memory.
+        Future versions of cuML will provide an alternative algorithm to
+        reduce memory consumption.
 
     Examples
     --------
@@ -149,7 +148,7 @@ class RandomForestRegressor(BaseRandomForestModel,
 
     Output:
 
-    .. code-block:: python
+    .. code-block:: none
 
         MSE score of cuml :  0.1123437201231765
 
@@ -159,7 +158,7 @@ class RandomForestRegressor(BaseRandomForestModel,
         Number of trees in the forest. (Default changed to 100 in cuML 0.11)
     split_algo : int (default = 1)
         The algorithm to determine how nodes are split in the tree.
-        0 for HIST and 1 for GLOBAL_QUANTILE. HIST curently uses a slower
+        0 for HIST and 1 for GLOBAL_QUANTILE. HIST currently uses a slower
         tree-building algorithm so GLOBAL_QUANTILE is recommended for most
         cases.
     split_criterion : int (default = 2)
@@ -218,26 +217,24 @@ class RandomForestRegressor(BaseRandomForestModel,
         for mean of abs error : 'mean_ae'
         for mean square error' : 'mse'
     quantile_per_tree : boolean (default = False)
-        Whether quantile is computed for individal trees in RF.
-        Only relevant for GLOBAL_QUANTILE split_algo.
+        Whether quantile is computed for individual trees in RF.
+        Only relevant when `split_algo = GLOBAL_QUANTILE`.
+
+        .. deprecated:: 0.19
+           Parameter 'quantile_per_tree' is deprecated and will be removed in
+           release 0.20. Use a higher number of global quantile bins instead.
     use_experimental_backend : boolean (default = False)
         If set to true and  following conditions are also met, experimental
-         decision tree training implementation would be used:
-            split_algo = 1 (GLOBAL_QUANTILE)
-            quantile_per_tree = false (No per tree quantile computation)
+        decision tree training implementation would be used:
+        `split_algo = 1` (GLOBAL_QUANTILE) and `quantile_per_tree = False`
+        (no per-tree quantile computation).
     max_batch_size: int (default = 128)
         Maximum number of nodes that can be processed in a given batch. This is
         used only when 'use_experimental_backend' is true.
     random_state : int (default = None)
         Seed for the random number generator. Unseeded by default. Does not
-        currently fully guarantee the exact same results.
-    seed : int (default = None)
-        Seed for the random number generator. Unseeded by default. Does not
-        currently fully guarantee the exact same results.
-
-        .. deprecated:: 0.16
-           Parameter `seed` is deprecated and will be removed in 0.17. Please
-           use `random_state` instead
+        currently fully guarantee the exact same results. **Note: Parameter
+        `seed` was removed in release 0.19; use `random_state` instead.**
 
     handle : cuml.Handle
         Specifies the cuml.handle that holds internal CUDA state for

@@ -781,14 +781,14 @@ def test_rf_get_json(estimator_type, max_depth, n_estimators):
     if estimator_type == 'classification':
         cuml_model = curfc(max_features=1.0, max_samples=1.0,
                            n_bins=16, split_algo=0, split_criterion=0,
-                           min_samples_leaf=2, seed=23707, n_streams=1,
+                           min_samples_leaf=2, random_state=23707, n_streams=1,
                            n_estimators=n_estimators, max_leaves=-1,
                            max_depth=max_depth)
         y = y.astype(np.int32)
     elif estimator_type == 'regression':
         cuml_model = curfr(max_features=1.0, max_samples=1.0,
                            n_bins=16, split_algo=0,
-                           min_samples_leaf=2, seed=23707, n_streams=1,
+                           min_samples_leaf=2, random_state=23707, n_streams=1,
                            n_estimators=n_estimators, max_leaves=-1,
                            max_depth=max_depth)
         y = y.astype(np.float32)
@@ -862,7 +862,7 @@ def test_rf_instance_count(max_depth, n_estimators, use_experimental_backend):
     X = X.astype(np.float32)
     cuml_model = curfc(max_features=1.0, max_samples=1.0,
                        n_bins=16, split_algo=1, split_criterion=0,
-                       min_samples_leaf=2, seed=23707, n_streams=1,
+                       min_samples_leaf=2, random_state=23707, n_streams=1,
                        n_estimators=n_estimators, max_leaves=-1,
                        max_depth=max_depth,
                        use_experimental_backend=use_experimental_backend)
@@ -1031,7 +1031,7 @@ def test_rf_regression_with_identical_labels(split_criterion,
     # Degenerate case: all labels are identical.
     # RF Regressor must not create any split. It must yield an empty tree
     # with only the root node.
-    clf = curfr(max_features=1.0, rows_sample=1.0, n_bins=5, split_algo=1,
+    clf = curfr(max_features=1.0, max_samples=1.0, n_bins=5, split_algo=1,
                 bootstrap=False, split_criterion=split_criterion,
                 min_samples_leaf=1, min_samples_split=2, random_state=0,
                 n_streams=1, n_estimators=1, max_depth=1,