diff --git a/docs/source/estimator_intro.ipynb b/docs/source/estimator_intro.ipynb index b669420dcd..f9ac8b2866 100644 --- a/docs/source/estimator_intro.ipynb +++ b/docs/source/estimator_intro.ipynb @@ -94,7 +94,7 @@ "\n", "model = cuRF( max_depth = max_depth, \n", " n_estimators = n_estimators,\n", - " seed = 0 )\n", + " random_state = 0 )\n", "\n", "trained_RF = model.fit ( X_train, y_train )\n", "\n", diff --git a/notebooks/random_forest_demo.ipynb b/notebooks/random_forest_demo.ipynb index 34190efb50..a94b41cfe3 100755 --- a/notebooks/random_forest_demo.ipynb +++ b/notebooks/random_forest_demo.ipynb @@ -176,7 +176,7 @@ "cuml_model = curfc(n_estimators=40,\n", " max_depth=16,\n", " max_features=1.0,\n", - " seed=10)\n", + " random_state=10)\n", "\n", "cuml_model.fit(X_cudf_train, y_cudf_train)" ] diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 3d9e41a1f1..81e8bb3e31 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -55,22 +55,17 @@ class BaseRandomForestModel(Base): classes_ = CumlArrayDescriptor() - def __init__(self, *, split_criterion, seed=None, - n_streams=8, n_estimators=100, - max_depth=16, handle=None, max_features='auto', - n_bins=8, split_algo=1, bootstrap=True, - bootstrap_features=False, - verbose=False, min_rows_per_node=None, - min_samples_leaf=1, min_samples_split=2, - rows_sample=None, max_samples=1.0, max_leaves=-1, - accuracy_metric=None, dtype=None, - output_type=None, - min_weight_fraction_leaf=None, n_jobs=None, - max_leaf_nodes=None, min_impurity_decrease=0.0, - min_impurity_split=None, oob_score=None, - random_state=None, warm_start=None, class_weight=None, - quantile_per_tree=False, criterion=None, - use_experimental_backend=False, max_batch_size=128): + def __init__(self, *, split_criterion, n_streams=8, n_estimators=100, + max_depth=16, handle=None, max_features='auto', n_bins=8, + split_algo=1, bootstrap=True, bootstrap_features=False, + verbose=False, min_samples_leaf=1, min_samples_split=2, + max_samples=1.0, max_leaves=-1, accuracy_metric=None, + dtype=None, output_type=None, min_weight_fraction_leaf=None, + n_jobs=None, max_leaf_nodes=None, min_impurity_decrease=0.0, + min_impurity_split=None, oob_score=None, random_state=None, + warm_start=None, class_weight=None, quantile_per_tree=False, + criterion=None, use_experimental_backend=False, + max_batch_size=128): sklearn_params = {"criterion": criterion, "min_weight_fraction_leaf": min_weight_fraction_leaf, @@ -89,20 +84,6 @@ class BaseRandomForestModel(Base): "(https://docs.rapids.ai/api/cuml/nightly/" "api.html#random-forest) for more information") - if seed is not None: - if random_state is None: - warnings.warn("Parameter 'seed' is deprecated and will be" - " removed in 0.17. Please use 'random_state'" - " instead. 
Setting 'random_state' as the" - " curent 'seed' value", - DeprecationWarning) - random_state = seed - else: - warnings.warn("Both 'seed' and 'random_state' parameters were" - " set. Using 'random_state' since 'seed' is" - " deprecated and will be removed in 0.17.", - DeprecationWarning) - if ((random_state is not None) and (n_streams != 1)): warnings.warn("For reproducible results in Random Forest" " Classifier or for almost reproducible results" @@ -110,16 +91,10 @@ class BaseRandomForestModel(Base): "recommended. If n_streams is > 1, results may vary " "due to stream/thread timing differences, even when " "random_state is set") - if min_rows_per_node is not None: - warnings.warn("The 'min_rows_per_node' parameter is deprecated " - "and will be removed in 0.18. Please use " - "'min_samples_leaf' parameter instead.") - min_samples_leaf = min_rows_per_node - if rows_sample is not None: - warnings.warn("The 'rows_sample' parameter is deprecated and will " - "be removed in 0.18. Please use 'max_samples' " - "parameter instead.") - max_samples = rows_sample + if quantile_per_tree: + warnings.warn("The 'quantile_per_tree' parameter is deprecated " + "and will be removed in 0.20 release. Instead use " + "higher number of global quantile bins.") if handle is None: handle = Handle(n_streams) diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 2b25988c67..16281a8a49 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -129,40 +129,40 @@ class RandomForestClassifier(BaseRandomForestModel, Implements a Random Forest classifier model which fits multiple decision tree classifiers in an ensemble. - Note that the underlying algorithm for tree node splits differs from that - used in scikit-learn. By default, the cuML Random Forest uses a - histogram-based algorithms to determine splits, rather than an exact - count. 
You can tune the size of the histograms with the n_bins parameter. - - .. note:: This is an early release of the cuML - Random Forest code. It contains a few known limitations: - - * GPU-based inference is only supported if the model was trained - with 32-bit (float32) datatypes. CPU-based inference may be used - in this case as a slower fallback. - * Very deep / very wide models may exhaust available GPU memory. - Future versions of cuML will provide an alternative algorithm to - reduce memory consumption. - * While training the model for multi class classification problems, - using deep trees or `max_features=1.0` provides better performance. + .. note:: The underlying algorithm for tree node splits differs + from that used in scikit-learn. By default, the cuML Random Forest uses a + histogram-based algorithm to determine splits, rather than an exact + count. You can tune the size of the histograms with the n_bins parameter. + + **Known Limitations**: This is an early release of the cuML + Random Forest code. It contains a few known limitations: + + * GPU-based inference is only supported if the model was trained + with 32-bit (float32) datatypes. CPU-based inference may be used + in this case as a slower fallback. + * Very deep / very wide models may exhaust available GPU memory. + Future versions of cuML will provide an alternative algorithm to + reduce memory consumption. + * While training the model for multi-class classification problems, + using deep trees or `max_features=1.0` provides better performance. Examples -------- .. 
code-block:: python - import numpy as np - from cuml.ensemble import RandomForestClassifier as cuRFC + import numpy as np + from cuml.ensemble import RandomForestClassifier as cuRFC - X = np.random.normal(size=(10,4)).astype(np.float32) - y = np.asarray([0,1]*5, dtype=np.int32) + X = np.random.normal(size=(10,4)).astype(np.float32) + y = np.asarray([0,1]*5, dtype=np.int32) - cuml_model = cuRFC(max_features=1.0, - n_bins=8, - n_estimators=40) - cuml_model.fit(X,y) - cuml_predict = cuml_model.predict(X) + cuml_model = cuRFC(max_features=1.0, + n_bins=8, + n_estimators=40) + cuml_model.fit(X,y) + cuml_predict = cuml_model.predict(X) - print("Predicted labels : ", cuml_predict) + print("Predicted labels : ", cuml_predict) Output: @@ -180,7 +180,7 @@ class RandomForestClassifier(BaseRandomForestModel, (default = 0) split_algo : int (default = 1) The algorithm to determine how nodes are split in the tree. - 0 for HIST and 1 for GLOBAL_QUANTILE. HIST curently uses a slower + 0 for HIST and 1 for GLOBAL_QUANTILE. HIST currently uses a slower tree-building algorithm so GLOBAL_QUANTILE is recommended for most cases. bootstrap : boolean (default = True) @@ -226,24 +226,25 @@ class RandomForestClassifier(BaseRandomForestModel, Minimum decrease in impurity requried for node to be spilt. quantile_per_tree : boolean (default = False) - Whether quantile is computed for individal trees in RF. - Only relevant for GLOBAL_QUANTILE split_algo. + Whether quantile is computed for individual trees in RF. + Only relevant when `split_algo = GLOBAL_QUANTILE`. + + .. deprecated:: 0.19 + Parameter 'quantile_per_tree' is deprecated and will be removed in + release 0.20. 
use_experimental_backend : boolean (default = False) If set to true and following conditions are also met, experimental - decision tree training implementation would be used: - split_algo = 1 (GLOBAL_QUANTILE) - quantile_per_tree = false (No per tree quantile computation) + decision tree training implementation would be used only if + `split_algo = 1` (GLOBAL_QUANTILE) and `quantile_per_tree = False` + (No per tree quantile computation). max_batch_size: int (default = 128) Maximum number of nodes that can be processed in a given batch. This is - used only when 'use_experimental_backend' is true. + used only when 'use_experimental_backend' is true. Does not currently + fully guarantee the exact same results. random_state : int (default = None) - Seed for the random number generator. Unseeded by default. - seed : int (default = None) - Seed for the random number generator. Unseeded by default. - - .. deprecated:: 0.16 - Parameter `seed` is deprecated and will be removed in 0.17. Please - use `random_state` instead + Seed for the random number generator. Unseeded by default. Does not + currently fully guarantee the exact same results. **Note: Parameter + `seed` was removed in release 0.19.** handle : cuml.Handle Specifies the cuml.handle that holds internal CUDA state for diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index f36143e157..c16ea32487 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -112,21 +112,20 @@ class RandomForestRegressor(BaseRandomForestModel, Implements a Random Forest regressor model which fits multiple decision trees in an ensemble. - .. note:: that the underlying algorithm for tree node splits differs from - that used in scikit-learn. By default, the cuML Random Forest uses a - histogram-based algorithm to determine splits, rather than an exact - count. 
You can tune the size of the histograms with the n_bins - parameter. + .. note:: The underlying algorithm for tree node splits differs + from that used in scikit-learn. By default, the cuML Random Forest uses a + histogram-based algorithm to determine splits, rather than an exact + count. You can tune the size of the histograms with the n_bins parameter. **Known Limitations**: This is an early release of the cuML Random Forest code. It contains a few known limitations: - * GPU-based inference is only supported if the model was trained - with 32-bit (float32) datatypes. CPU-based inference may be used - in this case as a slower fallback. - * Very deep / very wide models may exhaust available GPU memory. - Future versions of cuML will provide an alternative algorithm to - reduce memory consumption. + * GPU-based inference is only supported if the model was trained + with 32-bit (float32) datatypes. CPU-based inference may be used + in this case as a slower fallback. + * Very deep / very wide models may exhaust available GPU memory. + Future versions of cuML will provide an alternative algorithm to + reduce memory consumption. Examples -------- @@ -149,7 +148,7 @@ class RandomForestRegressor(BaseRandomForestModel, Output: - .. code-block:: python + .. code-block:: none MSE score of cuml : 0.1123437201231765 @@ -159,7 +158,7 @@ class RandomForestRegressor(BaseRandomForestModel, Number of trees in the forest. (Default changed to 100 in cuML 0.11) split_algo : int (default = 1) The algorithm to determine how nodes are split in the tree. - 0 for HIST and 1 for GLOBAL_QUANTILE. HIST curently uses a slower + 0 for HIST and 1 for GLOBAL_QUANTILE. HIST currently uses a slower tree-building algorithm so GLOBAL_QUANTILE is recommended for most cases. 
split_criterion : int (default = 2) @@ -218,26 +217,24 @@ class RandomForestRegressor(BaseRandomForestModel, for mean of abs error : 'mean_ae' for mean square error' : 'mse' quantile_per_tree : boolean (default = False) - Whether quantile is computed for individal trees in RF. - Only relevant for GLOBAL_QUANTILE split_algo. + Whether quantile is computed for individual trees in RF. + Only relevant when `split_algo = GLOBAL_QUANTILE`. + + .. deprecated:: 0.19 + Parameter 'quantile_per_tree' is deprecated and will be removed in + release 0.20. use_experimental_backend : boolean (default = False) If set to true and following conditions are also met, experimental - decision tree training implementation would be used: - split_algo = 1 (GLOBAL_QUANTILE) - quantile_per_tree = false (No per tree quantile computation) + decision tree training implementation would be used only if + `split_algo = 1` (GLOBAL_QUANTILE) and `quantile_per_tree = False` + (No per tree quantile computation). max_batch_size: int (default = 128) Maximum number of nodes that can be processed in a given batch. This is used only when 'use_experimental_backend' is true. random_state : int (default = None) Seed for the random number generator. Unseeded by default. Does not - currently fully guarantee the exact same results. - seed : int (default = None) - Seed for the random number generator. Unseeded by default. Does not - currently fully guarantee the exact same results. - - .. deprecated:: 0.16 - Parameter `seed` is deprecated and will be removed in 0.17. Please - use `random_state` instead + currently fully guarantee the exact same results. 
**Note: Parameter + `seed` was removed in release 0.19.** handle : cuml.Handle Specifies the cuml.handle that holds internal CUDA state for diff --git a/python/cuml/test/test_random_forest.py b/python/cuml/test/test_random_forest.py index 8a86b4b730..b3f9b3a508 100644 --- a/python/cuml/test/test_random_forest.py +++ b/python/cuml/test/test_random_forest.py @@ -781,14 +781,14 @@ def test_rf_get_json(estimator_type, max_depth, n_estimators): if estimator_type == 'classification': cuml_model = curfc(max_features=1.0, max_samples=1.0, n_bins=16, split_algo=0, split_criterion=0, - min_samples_leaf=2, seed=23707, n_streams=1, + min_samples_leaf=2, random_state=23707, n_streams=1, n_estimators=n_estimators, max_leaves=-1, max_depth=max_depth) y = y.astype(np.int32) elif estimator_type == 'regression': cuml_model = curfr(max_features=1.0, max_samples=1.0, n_bins=16, split_algo=0, - min_samples_leaf=2, seed=23707, n_streams=1, + min_samples_leaf=2, random_state=23707, n_streams=1, n_estimators=n_estimators, max_leaves=-1, max_depth=max_depth) y = y.astype(np.float32) @@ -862,7 +862,7 @@ def test_rf_instance_count(max_depth, n_estimators, use_experimental_backend): X = X.astype(np.float32) cuml_model = curfc(max_features=1.0, max_samples=1.0, n_bins=16, split_algo=1, split_criterion=0, - min_samples_leaf=2, seed=23707, n_streams=1, + min_samples_leaf=2, random_state=23707, n_streams=1, n_estimators=n_estimators, max_leaves=-1, max_depth=max_depth, use_experimental_backend=use_experimental_backend) @@ -1031,7 +1031,7 @@ def test_rf_regression_with_identical_labels(split_criterion, # Degenerate case: all labels are identical. # RF Regressor must not create any split. It must yield an empty tree # with only the root node. 
- clf = curfr(max_features=1.0, rows_sample=1.0, n_bins=5, split_algo=1, + clf = curfr(max_features=1.0, max_samples=1.0, n_bins=5, split_algo=1, bootstrap=False, split_criterion=split_criterion, min_samples_leaf=1, min_samples_split=2, random_state=0, n_streams=1, n_estimators=1, max_depth=1,