From d62d0320528fe46b6d75b4339b0423af63238b95 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Fri, 30 Jul 2021 20:16:59 +0530 Subject: [PATCH 01/42] update docsstrings and add std::round --- cpp/include/cuml/tree/decisiontree.hpp | 4 +- .../batched-levelalgo/builder.cuh | 4 +- python/cuml/benchmark/ci_benchmark.py | 1 - .../dask/ensemble/randomforestclassifier.py | 34 ++---- .../dask/ensemble/randomforestregressor.py | 36 ++---- python/cuml/ensemble/randomforest_common.pyx | 16 +-- .../cuml/ensemble/randomforestclassifier.pyx | 105 +++++++++--------- .../cuml/ensemble/randomforestregressor.pyx | 84 ++++++-------- python/cuml/test/test_metrics.py | 2 +- 9 files changed, 114 insertions(+), 172 deletions(-) diff --git a/cpp/include/cuml/tree/decisiontree.hpp b/cpp/include/cuml/tree/decisiontree.hpp index 54020c45ec..22738cc537 100644 --- a/cpp/include/cuml/tree/decisiontree.hpp +++ b/cpp/include/cuml/tree/decisiontree.hpp @@ -29,11 +29,11 @@ namespace DT { struct DecisionTreeParams { /** - * Maximum tree depth. Unlimited (e.g., until leaves are pure), if -1. + * Maximum tree depth. Unlimited (e.g., until leaves are pure), If `-1`. */ int max_depth; /** - * Maximum leaf nodes per tree. Soft constraint. Unlimited, if -1. + * Maximum leaf nodes per tree. Soft constraint. Unlimited, If `-1`. */ int max_leaves; /** diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh index a46ee558f2..53f5787102 100644 --- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh @@ -83,7 +83,7 @@ void grow_tree(std::shared_ptr d_allocator, nrows, ncols, n_sampled_rows, - IdxT(params.max_features * ncols), + IdxT(std::round(params.max_features * ncols)), rowids, unique_labels, quantiles); @@ -120,8 +120,6 @@ void grow_tree(std::shared_ptr d_allocator, * [on device] [col-major] * [dim = params.n_bins x ncols] * @param[in] rowids sampled rows [on device] [len = n_sampled_rows] - * @param[in] colids sampled cols [on device] - * [len = params.max_features * ncols] * @param[in] n_sampled_rows number of sub-sampled rows * @param[in] unique_labels number of classes (meaningful only for * classification) diff --git a/python/cuml/benchmark/ci_benchmark.py b/python/cuml/benchmark/ci_benchmark.py index 90fa9dce89..a4f0f908dc 100644 --- a/python/cuml/benchmark/ci_benchmark.py +++ b/python/cuml/benchmark/ci_benchmark.py @@ -173,7 +173,6 @@ def make_bench_configs(long_config): bench_dims=default_dims, cuml_param_override_list=[ {"n_bins": [8, 32]}, - {"split_algo": [0, 1]}, {"max_features": ['sqrt', 1.0]}, ], ) diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py index 692d9e3a0e..19bce0795e 100755 --- a/python/cuml/dask/ensemble/randomforestclassifier.py +++ b/python/cuml/dask/ensemble/randomforestclassifier.py @@ -75,26 +75,21 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin, handles in several streams. If it is None, a new one is created. split_criterion : The criterion used to split nodes. - 0 for GINI, 1 for ENTROPY, 4 for CRITERION_END. - 2 and 3 not valid for classification - (default = 0) - split_algo : 0 for HIST and 1 for GLOBAL_QUANTILE (default = 1) - the algorithm to determine how nodes are split in the tree. - split_criterion : The criterion used to split nodes. - 0 for GINI, 1 for ENTROPY, 4 for CRITERION_END. + 0 for Gini impurity, 1 for Entropy (Information Gain), + 3 for CRITERION_END. 
2 and 3 not valid for classification (default = 0) bootstrap : boolean (default = True) Control bootstrapping. If set, each tree in the forest is built on a bootstrapped sample with replacement. - If False, the whole dataset is used to build each tree. + If `False`, the whole dataset is used to build each tree. max_samples : float (default = 1.0) Ratio of dataset rows used while fitting each tree. max_depth : int (default = -1) - Maximum tree depth. Unlimited (i.e, until leaves are pure), if -1. + Maximum tree depth. Unlimited (i.e, until leaves are pure), If `-1`. max_leaves : int (default = -1) - Maximum leaf nodes per tree. Soft constraint. Unlimited, if -1. + Maximum leaf nodes per tree. Soft constraint. Unlimited, If `-1`. max_features : float (default = 'auto') Ratio of number of features (columns) to consider per node split. @@ -102,27 +97,16 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin, Number of bins used by the split algorithm. min_samples_leaf : int or float (default = 1) The minimum number of samples (rows) in each leaf node. - If int, then min_samples_leaf represents the minimum number. + If type `int`, then `min_samples_leaf` represents the minimum number. If float, then min_samples_leaf represents a fraction and - ceil(min_samples_leaf * n_rows) is the minimum number of samples + `ceil(min_samples_leaf * n_rows)` is the minimum number of samples for each leaf node. min_samples_split : int or float (default = 2) The minimum number of samples required to split an internal node. - If int, then min_samples_split represents the minimum number. - If float, then min_samples_split represents a fraction and + If type `int`, then min_samples_split represents the minimum number. + If type `float`, then `min_samples_split` represents a fraction and ceil(min_samples_split * n_rows) is the minimum number of samples for each split. - quantile_per_tree : boolean (default = False) - Whether quantile is computed for individual RF trees. - Only relevant for GLOBAL_QUANTILE split_algo. - use_experimental_backend : boolean (default = True) - If set to true and the following conditions are also met, a new - experimental backend for decision tree training will be used. The - new backend is available only if `split_algo = 1` (GLOBAL_QUANTILE) - and `quantile_per_tree = False` (No per tree quantile computation). - The new backend is considered stable for classification tasks but - not yet for regression tasks. The RAPIDS team is continuing - optimization and evaluation of the new backend for regression tasks. n_streams : int (default = 4 ) Number of parallel streams used for forest building workers : optional, list of strings diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py index 3b21810fb4..e1a1d43676 100755 --- a/python/cuml/dask/ensemble/randomforestregressor.py +++ b/python/cuml/dask/ensemble/randomforestregressor.py @@ -68,25 +68,22 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin, run different models concurrently in different streams by creating handles in several streams. If it is None, a new one is created. - split_algo : int (default = 1) - 0 for HIST, 1 for GLOBAL_QUANTILE - The type of algorithm to be used to create the trees. split_criterion : int (default = 2) The criterion used to split nodes. - 0 for GINI, 1 for ENTROPY, - 2 for MSE, 3 for MAE and 4 for CRITERION_END. 
+ 0 for Gini impurity, 1 for Entropy (Information Gain), + 2 for MSE (Mean Squared Error), and 3 for CRITERION_END. 0 and 1 not valid for regression bootstrap : boolean (default = True) Control bootstrapping. If set, each tree in the forest is built on a bootstrapped sample with replacement. - If False, the whole dataset is used to build each tree. + If `False`, the whole dataset is used to build each tree. max_samples : float (default = 1.0) Ratio of dataset rows used while fitting each tree. max_depth : int (default = -1) - Maximum tree depth. Unlimited (i.e, until leaves are pure), if -1. + Maximum tree depth. Unlimited (i.e, until leaves are pure), If `-1`. max_leaves : int (default = -1) - Maximum leaf nodes per tree. Soft constraint. Unlimited, if -1. + Maximum leaf nodes per tree. Soft constraint. Unlimited, If `-1`. max_features : int or float or string or None (default = 'auto') Ratio of number of features (columns) to consider per node split. @@ -100,15 +97,15 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin, Number of bins used by the split algorithm. min_samples_leaf : int or float (default = 1) The minimum number of samples (rows) in each leaf node. - If int, then min_samples_leaf represents the minimum number. - If float, then min_samples_leaf represents a fraction and - ceil(min_samples_leaf * n_rows) is the minimum number of samples + If type `int`, then `min_samples_leaf` represents the minimum number. + If `float`, then `min_samples_leaf` represents a fraction and + `ceil(min_samples_leaf * n_rows)` is the minimum number of samples for each leaf node. min_samples_split : int or float (default = 2) The minimum number of samples required to split an internal node. - If int, then min_samples_split represents the minimum number. - If float, then min_samples_split represents a fraction and - ceil(min_samples_split * n_rows) is the minimum number of samples + If type `int`, then `min_samples_split` represents the minimum number. + If type `float`, then `min_samples_split` represents a fraction and + `ceil(min_samples_split * n_rows)` is the minimum number of samples for each split. accuracy_metric : string (default = 'r2') Decides the metric used to evaluate the performance of the model. @@ -118,17 +115,6 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin, for median of abs error : 'median_ae' for mean of abs error : 'mean_ae' for mean square error' : 'mse' - quantile_per_tree : boolean (default = False) - Whether quantile is computed for individual RF trees. - Only relevant for GLOBAL_QUANTILE split_algo. - use_experimental_backend : boolean (default = False) - If set to true and the following conditions are also met, a new - experimental backend for decision tree training will be used. The - new backend is available only if `split_algo = 1` (GLOBAL_QUANTILE) - and `quantile_per_tree = False` (No per tree quantile computation). - The new backend is considered stable for classification tasks but - not yet for regression tasks. The RAPIDS team is continuing - optimization and evaluation of the new backend for regression tasks. 
n_streams : int (default = 4 ) Number of parallel streams used for forest building workers : optional, list of strings diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 1bba0d37a1..1e52ce3adc 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -41,14 +41,14 @@ from cuml.common.array_descriptor import CumlArrayDescriptor class BaseRandomForestModel(Base): _param_names = ['n_estimators', 'max_depth', 'handle', 'max_features', 'n_bins', - 'split_algo', 'split_criterion', 'min_samples_leaf', + 'split_criterion', 'min_samples_leaf', 'min_samples_split', 'min_impurity_decrease', 'bootstrap', 'verbose', 'max_samples', 'max_leaves', - 'accuracy_metric', 'use_experimental_backend', - 'max_batch_size', 'n_streams', 'dtype', + 'accuracy_metric', 'max_batch_size', + 'n_streams', 'dtype', 'output_type', 'min_weight_fraction_leaf', 'n_jobs', 'max_leaf_nodes', 'min_impurity_split', 'oob_score', 'random_state', 'warm_start', 'class_weight', @@ -100,18 +100,10 @@ class BaseRandomForestModel(Base): if ((random_state is not None) and (n_streams != 1)): warnings.warn("For reproducible results in Random Forest" " Classifier or for almost reproducible results" - " in Random Forest Regressor, n_streams==1 is " + " in Random Forest Regressor, n_streams=1 is " "recommended. If n_streams is > 1, results may vary " "due to stream/thread timing differences, even when " "random_state is set") - if 'use_experimental_backend' in kwargs.keys(): - warnings.warn("The 'use_experimental_backend' parameter is " - "deprecated and has no effect. " - "It will be removed in 21.10 release.") - if 'split_algo' in kwargs.keys(): - warnings.warn("The 'split_algo' parameter is " - "deprecated and has no effect. " - "It will be removed in 21.10 release.") if handle is None: handle = Handle(n_streams) diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index f68bee6088..15eef87614 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -115,39 +115,14 @@ class RandomForestClassifier(BaseRandomForestModel, .. note:: Note that the underlying algorithm for tree node splits differs from that used in scikit-learn. By default, the cuML Random Forest uses a - histogram-based algorithm to determine splits, rather than an exact - count. You can tune the size of the histograms with the n_bins parameter. + quantile-based algorithm to determine splits, rather than an exact + count. You can tune the size of the quantiles with the `n_bins` parameter. .. note:: You can export cuML Random Forest models and run predictions with them on machines without an NVIDIA GPUs. See https://docs.rapids.ai/api/cuml/nightly/pickling_cuml_models.html for more details. - **Known Limitations**: This is an early release of the cuML - Random Forest code. It contains a few known limitations: - - * GPU-based inference is only supported if the model was trained - with 32-bit (float32) datatypes. CPU-based inference may be used - in this case as a slower fallback. - * Very deep / very wide models may exhaust available GPU memory. - Future versions of cuML will provide an alternative algorithm to - reduce memory consumption. - * While training the model for multi class classification problems, - using deep trees or `max_features=1.0` provides better performance. 
- * Prediction of classes is currently different from how scikit-learn - predicts: - * scikit-learn predicts random forest classifiers by obtaining class - probabilities from each component tree, then averaging these class - probabilities over all the ensemble members, and finally resolving - to the label with highest probability as prediction. - * cuml random forest classifier prediction differs in that, each - component tree generates labels instead of class probabilities; - with the most frequent label over all the trees (the statistical - mode) resolved as prediction. - The above differences might cause marginal variations in accuracy in - tradeoff to better performance. - See: https://github.com/rapidsai/cuml/issues/3764 - Examples -------- .. code-block:: python @@ -177,34 +152,32 @@ class RandomForestClassifier(BaseRandomForestModel, n_estimators : int (default = 100) Number of trees in the forest. (Default changed to 100 in cuML 0.11) split_criterion : The criterion used to split nodes. - 0 for GINI, 1 for ENTROPY + 0 for Gini impurity, + 1 for Entropy (Information Gain). 2 and 3 not valid for classification (default = 0) - split_algo : int (default = 1) - Deprecated and currrently has no effect. - .. deprecated:: 21.06 bootstrap : boolean (default = True) Control bootstrapping. - If True, each tree in the forest is built + If `True`, eachtree in the forest is built on a bootstrapped sample with replacement. - If False, the whole dataset is used to build each tree. + If `False`, the whole dataset is used to build each tree. max_samples : float (default = 1.0) Ratio of dataset rows used while fitting each tree. max_depth : int (default = 16) Maximum tree depth. Unlimited (i.e, until leaves are pure), - if -1. Unlimited depth is not supported. + If `-1`. Unlimited depth is not supported. *Note that this default differs from scikit-learn's random forest, which defaults to unlimited depth.* max_leaves : int (default = -1) Maximum leaf nodes per tree. Soft constraint. Unlimited, - if -1. + If `-1`. max_features : int, float, or string (default = 'auto') - Ratio of number of features (columns) to consider per node split. - If int then max_features/n_features. - If float then max_features is used as a fraction. - If 'auto' then max_features=1/sqrt(n_features). - If 'sqrt' then max_features=1/sqrt(n_features). - If 'log2' then max_features=log2(n_features)/n_features. + Ratio of number of features (columns) to consider per node split.\n + If type `int` then `max_features` is the absolute count of features to be used\n + If type `float` then `max_features` is used as a fraction.\n + If `'auto'` then `max_features=1/sqrt(n_features)`.\n + If `'sqrt'` then `max_features=1/sqrt(n_features)`.\n + If `'log2'` then `max_features=log2(n_features)/n_features`. n_bins : int (default = 128) Number of bins used by the split algorithm. For large problems, particularly those with highly-skewed input data, @@ -212,30 +185,25 @@ class RandomForestClassifier(BaseRandomForestModel, n_streams : int (default = 4) Number of parallel streams used for forest building. min_samples_leaf : int or float (default = 1) - The minimum number of samples (rows) in each leaf node. - If int, then min_samples_leaf represents the minimum number. 
+        The minimum number of samples (rows) in each leaf node.\n
+        If type `int`, then `min_samples_leaf` represents the minimum number.\n
         If float, then min_samples_leaf represents a fraction and
-        ceil(min_samples_leaf * n_rows) is the minimum number of samples
+        `ceil(min_samples_leaf * n_rows)` is the minimum number of samples
         for each leaf node.
     min_samples_split : int or float (default = 2)
-        The minimum number of samples required to split an internal node.
-        If int, then min_samples_split represents the minimum number.
-        If float, then min_samples_split represents a fraction and
-        ceil(min_samples_split * n_rows) is the minimum number of samples
+        The minimum number of samples required to split an internal node.\n
+        If type `int`, then `min_samples_split` represents the minimum number.\n
+        If type `float`, then `min_samples_split` represents a fraction and
+        `ceil(min_samples_split * n_rows)` is the minimum number of samples
         for each split.
     min_impurity_decrease : float (default = 0.0)
         Minimum decrease in impurity required for node to be split.
-    use_experimental_backend : boolean (default = True)
-        Deprecated and currrently has no effect.
-        .. deprecated:: 21.08
-    max_batch_size: int (default = 4096)
+    max_batch_size : int (default = 4096)
         Maximum number of nodes that can be processed in a given batch.
     random_state : int (default = None)
         Seed for the random number generator. Unseeded by default. Does not
-        currently fully guarantee the exact same results. **Note: Parameter
-        `seed` is removed since release 0.19.**
-
+        currently fully guarantee the exact same results.
     handle : cuml.Handle
         Specifies the cuml.handle that holds internal CUDA state for
         computations in this model. Most importantly, this specifies the CUDA
         stream that will be used for the model's computations, so users can
         run different models concurrently in different streams by creating
         handles in several streams.
         If it is None, a new one is created.
     verbose : int or boolean, default=False
         Sets logging level. It must be one of `cuml.common.logger.level_*`.
         See :ref:`verbosity-levels` for more info.
     output_type : {'input', 'cudf', 'cupy', 'numpy', 'numba'}, default=None
         Variable to control output type of the results and attributes of
         the estimator. If None, it'll inherit the output type set at the
         module level, `cuml.global_settings.output_type`.
         See :ref:`output-data-type-configuration` for more info.

     Notes
     -----
     **Known Limitations**\n
     This is an early release of the cuML
     Random Forest code. It contains a few known limitations:

     * GPU-based inference is only supported if the model was trained
       with 32-bit (float32) datatypes. CPU-based inference may be used
       in this case as a slower fallback.
     * While training the model for multi class classification problems,
       using deep trees or `max_features=1.0` provides better performance.
     * Prediction of classes is currently different from how scikit-learn
       predicts:
       * scikit-learn predicts random forest classifiers by obtaining class
         probabilities from each component tree, then averaging these class
         probabilities over all the ensemble members, and finally resolving
         to the label with highest probability as prediction.
       * cuml random forest classifier prediction differs in that, each
         component tree generates labels instead of class probabilities;
         with the most frequent label over all the trees (the statistical
         mode) resolved as prediction.
       The above differences might cause marginal variations in accuracy in
       tradeoff to better performance.
       See: https://github.com/rapidsai/cuml/issues/3764

     For additional docs, see `scikit-learn's RandomForestClassifier
     <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html>`_.
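     The probability route can be used when scikit-learn-style predictions
     are wanted. A rough sketch of the idea (assuming a fitted classifier
     ``clf`` and float32 features ``X``; the exact output container depends
     on ``output_type``):

     .. code-block:: python

         import cupy as cp

         # cuML's default: majority vote over per-tree label predictions
         labels_mode = clf.predict(X)

         # closer to scikit-learn: average class probabilities over the
         # ensemble, then take the most probable class per row
         proba = clf.predict_proba(X)
         labels_avg = cp.argmax(cp.asarray(proba), axis=1)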
""" def __init__(self, *, split_criterion=0, handle=None, verbose=False, diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index c96ff64eb6..870a1737eb 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -114,23 +114,14 @@ class RandomForestRegressor(BaseRandomForestModel, .. note:: Note that the underlying algorithm for tree node splits differs from that used in scikit-learn. By default, the cuML Random Forest uses a - histogram-based algorithm to determine splits, rather than an exact - count. You can tune the size of the histograms with the n_bins parameter. + quantile-based algorithm to determine splits, rather than an exact + count. You can tune the size of the quantiles with the `n_bins` parameter. .. note:: You can export cuML Random Forest models and run predictions with them on machines without an NVIDIA GPUs. See https://docs.rapids.ai/api/cuml/nightly/pickling_cuml_models.html for more details. - **Known Limitations**: This is an early release of the cuML - Random Forest code. It contains a few known limitations: - - * GPU-based inference is only supported if the model was trained - with 32-bit (float32) datatypes. CPU-based inference may be used - in this case as a slower fallback. - * Very deep / very wide models may exhaust available GPU memory. - Future versions of cuML will provide an alternative algorithm to - reduce memory consumption. Examples -------- @@ -140,7 +131,6 @@ class RandomForestRegressor(BaseRandomForestModel, import numpy as np from cuml.test.utils import get_handle from cuml.ensemble import RandomForestRegressor as curfc - from cuml.test.utils import get_handle X = np.asarray([[0,10],[0,20],[0,30],[0,40]], dtype=np.float32) y = np.asarray([0.0,1.0,2.0,3.0], dtype=np.float32) cuml_model = curfc(max_features=1.0, n_bins=128, @@ -161,45 +151,35 @@ class RandomForestRegressor(BaseRandomForestModel, ----------- n_estimators : int (default = 100) Number of trees in the forest. (Default changed to 100 in cuML 0.11) - split_algo : int (default = 1) - The algorithm to determine how nodes are split in the tree. - Can be changed only for the old backend [deprecated]. - 0 for HIST and 1 for GLOBAL_QUANTILE. Default is GLOBAL_QUANTILE. - The default backend does not support HIST. - HIST currently uses a slower tree-building algorithm so - GLOBAL_QUANTILE is recommended for most cases. - - .. deprecated:: 21.06 - Parameter 'split_algo' is deprecated and will be removed in - subsequent release. split_criterion : int (default = 2) The criterion used to split nodes. - 0 for GINI, 1 for ENTROPY, - 2 for MSE + 0 for Gini impurity, + 1 for Entropy (Information Gain), + 2 for MSE (Mean Squared Error). 0 and 1 not valid for regression bootstrap : boolean (default = True) Control bootstrapping. - If True, each tree in the forest is built + If `True`, eachtree in the forest is built on a bootstrapped sample with replacement. - If False, the whole dataset is used to build each tree. + If `False`, the whole dataset is used to build each tree. max_samples : float (default = 1.0) Ratio of dataset rows used while fitting each tree. max_depth : int (default = 16) Maximum tree depth. Unlimited (i.e, until leaves are pure), - if -1. + If `-1`. *Note that this default differs from scikit-learn's random forest, which defaults to unlimited depth.* max_leaves : int (default = -1) Maximum leaf nodes per tree. Soft constraint. Unlimited, - if -1. + If `-1`. 
max_features : int, float, or string (default = 'auto') Ratio of number of features (columns) to consider - per node split. - If int then max_features/n_features. - If float then max_features is used as a fraction. - If 'auto' then max_features=1.0. - If 'sqrt' then max_features=1/sqrt(n_features). - If 'log2' then max_features=log2(n_features)/n_features. + per node split.\n + If type `int` then `max_features` is the absolute count of features to be used.\n + If type `float` then `max_features` is used as a fraction.\n + If `'auto'` then `max_features=1.0`.\n + If `'sqrt'` then `max_features=1/sqrt(n_features)`.\n + If `'log2'` then `max_features=log2(n_features)/n_features`. n_bins : int (default = 128) Number of bins used by the split algorithm. For large problems, particularly those with highly-skewed input data, @@ -207,16 +187,16 @@ class RandomForestRegressor(BaseRandomForestModel, n_streams : int (default = 4 ) Number of parallel streams used for forest building min_samples_leaf : int or float (default = 1) - The minimum number of samples (rows) in each leaf node. - If int, then min_samples_leaf represents the minimum number. + The minimum number of samples (rows) in each leaf node.\n + If type `int`, then `min_samples_leaf` represents the minimum number.\n If float, then min_samples_leaf represents a fraction and - ceil(min_samples_leaf * n_rows) is the minimum number of samples + `ceil(min_samples_leaf * n_rows)` is the minimum number of samples for each leaf node. min_samples_split : int or float (default = 2) - The minimum number of samples required to split an internal node. - If int, then min_samples_split represents the minimum number. - If float, then min_samples_split represents a fraction and - ceil(min_samples_split * n_rows) is the minimum number of samples + The minimum number of samples required to split an internal node.\n + If type `int`, then min_samples_split represents the minimum number.\n + If type `float`, then `min_samples_split` represents a fraction and + `ceil(min_samples_split * n_rows)` is the minimum number of samples for each split. min_impurity_decrease : float (default = 0.0) The minimum decrease in impurity required for node to be split @@ -228,15 +208,11 @@ class RandomForestRegressor(BaseRandomForestModel, for median of abs error : 'median_ae' for mean of abs error : 'mean_ae' for mean square error' : 'mse' - use_experimental_backend : boolean (default = True) - Deprecated and currrently has no effect. - .. deprecated:: 21.08 - max_batch_size: int (default = 4096) + max_batch_size : int (default = 4096) Maximum number of nodes that can be processed in a given batch. random_state : int (default = None) Seed for the random number generator. Unseeded by default. Does not - currently fully guarantee the exact same results. **Note: Parameter - `seed` is removed since release 0.19.** + currently fully guarantee the exact same results. handle : cuml.Handle Specifies the cuml.handle that holds internal CUDA state for computations in this model. Most importantly, this specifies the CUDA @@ -253,6 +229,18 @@ class RandomForestRegressor(BaseRandomForestModel, module level, `cuml.global_settings.output_type`. See :ref:`output-data-type-configuration` for more info. + Notes + ----- + **Known Limitations**\n + This is an early release of the cuML + Random Forest code. It contains a few known limitations: + + * GPU-based inference is only supported if the model was trained + with 32-bit (float32) datatypes. 
CPU-based inference may be used
      in this case as a slower fallback.

    For additional docs, see `scikit-learn's RandomForestRegressor
    <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html>`_.
    """

    def __init__(self, *,
diff --git a/python/cuml/test/test_metrics.py b/python/cuml/test/test_metrics.py
index 8d642b72fd..35ab87a977 100644
--- a/python/cuml/test/test_metrics.py
+++ b/python/cuml/test/test_metrics.py
@@ -186,7 +186,7 @@ def test_accuracy(nrows, ncols, n_info, datatype):
     # Initialize, fit and predict using cuML's
     # random forest classification model
     cuml_model = curfc(max_features=1.0,
-                       n_bins=8, split_algo=0, split_criterion=0,
+                       n_bins=8, split_criterion=0,
                        min_samples_leaf=2,
                        n_estimators=40, handle=handle, max_leaves=-1,
                        max_depth=16)

From 994c10eade4c4a495c06c47407dfcfa29488d2f3 Mon Sep 17 00:00:00 2001
From: venkywonka
Date: Mon, 2 Aug 2021 19:46:18 +0530
Subject: [PATCH 02/42] suggest alternatives for GPU inference

---
 python/cuml/ensemble/randomforestclassifier.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index 15eef87614..0ece73a5b8 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -226,9 +226,9 @@ class RandomForestClassifier(BaseRandomForestModel,
     This is an early release of the cuML
     Random Forest code. It contains a few known limitations:

-    * GPU-based inference is only supported if the model was trained
-      with 32-bit (float32) datatypes. CPU-based inference may be used
-      in this case as a slower fallback.
+    * GPU-based inference is only supported with 32-bit (float32) datatypes.
+      Alternatives are to use CPU-based inference for 64-bit (float64) datatypes,
+      or let the default automatic datatype conversion occur during GPU inference.
     * While training the model for multi class classification problems,
       using deep trees or `max_features=1.0` provides better performance.
     * Prediction of classes is currently different from how scikit-learn

From efb177350ca1ef459ec1f9baaade32c2bf5fc8b0 Mon Sep 17 00:00:00 2001
From: venkywonka
Date: Mon, 2 Aug 2021 19:54:10 +0530
Subject: [PATCH 03/42] update previous commit for regressor docs

---
 python/cuml/ensemble/randomforestregressor.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index 870a1737eb..0aeea6f222 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -235,9 +235,9 @@ class RandomForestRegressor(BaseRandomForestModel,
     This is an early release of the cuML
     Random Forest code. It contains a few known limitations:

-    * GPU-based inference is only supported if the model was trained
-      with 32-bit (float32) datatypes. CPU-based inference may be used
-      in this case as a slower fallback.
+    * GPU-based inference is only supported with 32-bit (float32) datatypes.
+      Alternatives are to use CPU-based inference for 64-bit (float64) datatypes,
+      or let the default automatic datatype conversion occur during GPU inference.

     For additional docs, see `scikit-learn's RandomForestRegressor
     <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html>`_.
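The two patches above document alternatives to float32-only GPU inference.
A rough sketch of those alternatives (assuming a fitted single-GPU regressor
``reg``, float64 input ``X64``, and the ``predict_model``/``convert_dtype``
keywords of the single-GPU ``predict`` API; treat this as illustrative, not
as a tested recipe):

.. code-block:: python

    import numpy as np

    # CPU-based inference: handles float64 directly, at lower speed
    preds_cpu = reg.predict(X64, predict_model='CPU')

    # GPU-based inference: convert to float32 explicitly, or rely on the
    # automatic conversion performed when convert_dtype=True (the default)
    preds_gpu = reg.predict(X64.astype(np.float32), predict_model='GPU')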
From 7de63e5139240d14ac61d44fade4c6ab738f9cba Mon Sep 17 00:00:00 2001 From: venkywonka Date: Mon, 2 Aug 2021 19:55:25 +0530 Subject: [PATCH 04/42] copyright fix --- python/cuml/benchmark/ci_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/benchmark/ci_benchmark.py b/python/cuml/benchmark/ci_benchmark.py index a4f0f908dc..666fb39a7c 100644 --- a/python/cuml/benchmark/ci_benchmark.py +++ b/python/cuml/benchmark/ci_benchmark.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From d298d3060de9f60d4b46e27e84117cc0524af092 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Thu, 5 Aug 2021 14:05:01 +0530 Subject: [PATCH 05/42] flake8 fix --- python/cuml/ensemble/randomforestclassifier.pyx | 11 +++++++---- python/cuml/ensemble/randomforestregressor.pyx | 10 ++++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 0ece73a5b8..c2b0a69493 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -116,7 +116,8 @@ class RandomForestClassifier(BaseRandomForestModel, .. note:: Note that the underlying algorithm for tree node splits differs from that used in scikit-learn. By default, the cuML Random Forest uses a quantile-based algorithm to determine splits, rather than an exact - count. You can tune the size of the quantiles with the `n_bins` parameter. + count. You can tune the size of the quantiles with the `n_bins` + parameter. .. note:: You can export cuML Random Forest models and run predictions with them on machines without an NVIDIA GPUs. See @@ -173,7 +174,8 @@ class RandomForestClassifier(BaseRandomForestModel, If `-1`. max_features : int, float, or string (default = 'auto') Ratio of number of features (columns) to consider per node split.\n - If type `int` then `max_features` is the absolute count of features to be used\n + If type `int` then `max_features` is the absolute count of features to + be used\n If type `float` then `max_features` is used as a fraction.\n If `'auto'` then `max_features=1/sqrt(n_features)`.\n If `'sqrt'` then `max_features=1/sqrt(n_features)`.\n @@ -227,8 +229,9 @@ class RandomForestClassifier(BaseRandomForestModel, Random Forest code. It contains a few known limitations: * GPU-based inference is only supported with 32-bit (float32) datatypes. - Alternatives are to use CPU-based inference for 64-bit (float64) datatypes, - or let the default automatic datatype conversion occur during GPU inference. + Alternatives are to use CPU-based inference for 64-bit (float64) + datatypes, or let the default automatic datatype conversion occur + during GPU inference. * While training the model for multi class classification problems, using deep trees or `max_features=1.0` provides better performance. * Prediction of classes is currently different from how scikit-learn diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 0aeea6f222..e8fcea56d8 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -115,7 +115,7 @@ class RandomForestRegressor(BaseRandomForestModel, .. 
note:: Note that the underlying algorithm for tree node splits differs from that used in scikit-learn. By default, the cuML Random Forest uses a quantile-based algorithm to determine splits, rather than an exact - count. You can tune the size of the quantiles with the `n_bins` parameter. + count. You can tune the size of the quantiles with the `n_bins` parameter .. note:: You can export cuML Random Forest models and run predictions with them on machines without an NVIDIA GPUs. See @@ -175,7 +175,8 @@ class RandomForestRegressor(BaseRandomForestModel, max_features : int, float, or string (default = 'auto') Ratio of number of features (columns) to consider per node split.\n - If type `int` then `max_features` is the absolute count of features to be used.\n + If type `int` then `max_features` is the absolute count of features to + be used.\n If type `float` then `max_features` is used as a fraction.\n If `'auto'` then `max_features=1.0`.\n If `'sqrt'` then `max_features=1/sqrt(n_features)`.\n @@ -236,8 +237,9 @@ class RandomForestRegressor(BaseRandomForestModel, Random Forest code. It contains a few known limitations: * GPU-based inference is only supported with 32-bit (float32) datatypes. - Alternatives are to use CPU-based inference for 64-bit (float64) datatypes, - or let the default automatic datatype conversion occur during GPU inference. + Alternatives are to use CPU-based inference for 64-bit (float64) + datatypes, or let the default automatic datatype conversion occur + during GPU inference. For additional docs, see `scikitlearn's RandomForestRegressor `_. From c1bf494d79f7532141ce4a7c9ea3cf2a4d4289e5 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Thu, 5 Aug 2021 17:16:38 +0530 Subject: [PATCH 06/42] change default estimators in dask RF, consmetics changes --- .../dask/ensemble/randomforestclassifier.py | 76 +++++++++-------- .../dask/ensemble/randomforestregressor.py | 83 ++++++++++--------- .../cuml/ensemble/randomforestclassifier.pyx | 63 +++++++------- .../cuml/ensemble/randomforestregressor.pyx | 73 ++++++++-------- 4 files changed, 158 insertions(+), 137 deletions(-) diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py index 19bce0795e..765ff0e968 100755 --- a/python/cuml/dask/ensemble/randomforestclassifier.py +++ b/python/cuml/dask/ensemble/randomforestclassifier.py @@ -49,14 +49,14 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin, Future versions of the API will support more flexible data distribution and additional input types. - The distributed algorithm uses an embarrassingly-parallel - approach. For a forest with N trees being built on w workers, each - worker simply builds N/w trees on the data it has available + The distributed algorithm uses an *embarrassingly-parallel* + approach. For a forest with `N` trees being built on `w` workers, each + worker simply builds `N/w` trees on the data it has available locally. In many cases, partitioning the data so that each worker builds trees on a subset of the total dataset works well, but it generally requires the data to be well-shuffled in advance. Alternatively, callers can replicate all of the data across - workers so that rf.fit receives w partitions, each containing the + workers so that ``rf.fit`` receives `w` partitions, each containing the same data. This would produce results approximately identical to single-GPU fitting. 
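    A minimal sketch of that distributed workflow (the cluster setup and
    data names here are illustrative assumptions, not part of this API):

    .. code-block:: python

        from dask.distributed import Client
        from dask_cuda import LocalCUDACluster
        import dask_cudf
        from cuml.dask.ensemble import RandomForestClassifier

        cluster = LocalCUDACluster()   # one Dask worker per local GPU
        client = Client(cluster)
        n_workers = len(client.scheduler_info()['workers'])

        # X_cudf, y_cudf: pre-shuffled cuDF objects; one partition per worker
        X_dask = dask_cudf.from_cudf(X_cudf, npartitions=n_workers)
        y_dask = dask_cudf.from_cudf(y_cudf, npartitions=n_workers)

        model = RandomForestClassifier(n_estimators=100)  # ~100/w trees per worker
        model.fit(X_dask, y_dask)
        preds = model.predict(X_dask)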
@@ -65,7 +65,7 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin, Parameters ----------- - n_estimators : int (default = 10) + n_estimators : int (default = 100) total number of trees in the forest (not per-worker) handle : cuml.Handle Specifies the cuml.handle that holds internal CUDA state for @@ -74,39 +74,50 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin, run different models concurrently in different streams by creating handles in several streams. If it is None, a new one is created. - split_criterion : The criterion used to split nodes. - 0 for Gini impurity, 1 for Entropy (Information Gain), - 3 for CRITERION_END. + split_criterion : int (default = 0) + The criterion used to split nodes.\n + * 0 for Gini impurity + * 1 for Entropy (Information Gain), + * 3 for CRITERION_END. 2 and 3 not valid for classification - (default = 0) bootstrap : boolean (default = True) - Control bootstrapping. - If set, each tree in the forest is built - on a bootstrapped sample with replacement. - If `False`, the whole dataset is used to build each tree. + Control bootstrapping.\n + * If ``True``, each tree in the forest is built on a bootstrapped + sample with replacement. + * If ``False``, the whole dataset is used to build each tree. max_samples : float (default = 1.0) Ratio of dataset rows used while fitting each tree. max_depth : int (default = -1) - Maximum tree depth. Unlimited (i.e, until leaves are pure), If `-1`. + Maximum tree depth. Unlimited (i.e, until leaves are pure), If ``-1``. max_leaves : int (default = -1) - Maximum leaf nodes per tree. Soft constraint. Unlimited, If `-1`. + Maximum leaf nodes per tree. Soft constraint. Unlimited, If ``-1``. max_features : float (default = 'auto') Ratio of number of features (columns) to consider - per node split. - n_bins : int (default = 8) + per node split.\n + * If type ``int`` then ``max_features`` is the absolute count of + features to be used. + * If type ``float`` then ``max_features`` is a fraction. + * If ``'auto'`` then ``max_features=n_features = 1.0``. + * If ``'sqrt'`` then ``max_features=1/sqrt(n_features)``. + * If ``'log2'`` then ``max_features=log2(n_features)/n_features``. + * If ``None``, then ``max_features = 1.0``. + n_bins : int (default = 128) Number of bins used by the split algorithm. min_samples_leaf : int or float (default = 1) - The minimum number of samples (rows) in each leaf node. - If type `int`, then `min_samples_leaf` represents the minimum number. - If float, then min_samples_leaf represents a fraction and - `ceil(min_samples_leaf * n_rows)` is the minimum number of samples - for each leaf node. + The minimum number of samples (rows) in each leaf node.\n + * If type ``int``, then ``min_samples_leaf`` represents the minimum + number. + * If ``float``, then ``min_samples_leaf`` represents a fraction + and ``ceil(min_samples_leaf * n_rows)`` is the minimum number of + samples for each leaf node. min_samples_split : int or float (default = 2) - The minimum number of samples required to split an internal node. - If type `int`, then min_samples_split represents the minimum number. - If type `float`, then `min_samples_split` represents a fraction and - ceil(min_samples_split * n_rows) is the minimum number of samples - for each split. + The minimum number of samples required to split an internal + node.\n + * If type ``int``, then ``min_samples_split`` represents the minimum + number. 
+ * If type ``float``, then ``min_samples_split`` represents a fraction + and ``ceil(min_samples_split * n_rows)`` is the minimum number of + samples for each split. n_streams : int (default = 4 ) Number of parallel streams used for forest building workers : optional, list of strings @@ -135,7 +146,7 @@ def __init__( workers=None, client=None, verbose=False, - n_estimators=10, + n_estimators=100, random_state=None, ignore_empty_partitions=False, **kwargs @@ -326,7 +337,7 @@ def predict(self, X, algo='auto', threshold=0.5, for inference. Returns - ---------- + ------- y : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, 1) """ @@ -400,8 +411,9 @@ def predict_model_on_cpu(self, X, convert_dtype=True): When set to True, the predict method will, when necessary, convert the input to the data type which was used to train the model. This will increase memory used for the method. + Returns - ---------- + ------- y : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, 1) """ c = default_client() @@ -497,9 +509,7 @@ def predict_proba(self, X, Returns ------- - y : NumPy - Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_classes) - + y : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, 1) """ if self._get_internal_model() is None: self._set_internal_model(self._concat_treelite_models()) diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py index e1a1d43676..2e13ffa2dd 100755 --- a/python/cuml/dask/ensemble/randomforestregressor.py +++ b/python/cuml/dask/ensemble/randomforestregressor.py @@ -43,14 +43,14 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin, distribution and additional input types. User-facing APIs are expected to change in upcoming versions. - The distributed algorithm uses an embarrassingly-parallel - approach. For a forest with N trees being built on w workers, each - worker simply builds N/w trees on the data it has available + The distributed algorithm uses an *embarrassingly-parallel* + approach. For a forest with `N` trees being built on `w` workers, each + worker simply builds `N/w` trees on the data it has available locally. In many cases, partitioning the data so that each worker builds trees on a subset of the total dataset works well, but it generally requires the data to be well-shuffled in advance. Alternatively, callers can replicate all of the data across - workers so that rf.fit receives w partitions, each containing the + workers so that ``rf.fit`` receives `w` partitions, each containing the same data. This would produce results approximately identical to single-GPU fitting. @@ -59,7 +59,7 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin, Parameters ----------- - n_estimators : int (default = 10) + n_estimators : int (default = 100) total number of trees in the forest (not per-worker) handle : cuml.Handle Specifies the cuml.handle that holds internal CUDA state for @@ -69,52 +69,57 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin, handles in several streams. If it is None, a new one is created. split_criterion : int (default = 2) - The criterion used to split nodes. - 0 for Gini impurity, 1 for Entropy (Information Gain), - 2 for MSE (Mean Squared Error), and 3 for CRITERION_END. + The criterion used to split nodes.\n + * 0 for Gini impurity + * 1 for Entropy (Information Gain), + * 2 for MSE (Mean Squared Error) + * 3 for CRITERION_END. 
0 and 1 not valid for regression bootstrap : boolean (default = True) - Control bootstrapping. - If set, each tree in the forest is built - on a bootstrapped sample with replacement. - If `False`, the whole dataset is used to build each tree. + Control bootstrapping.\n + * If ``True``, each tree in the forest is built on a bootstrapped + sample with replacement. + * If ``False``, the whole dataset is used to build each tree. max_samples : float (default = 1.0) Ratio of dataset rows used while fitting each tree. max_depth : int (default = -1) - Maximum tree depth. Unlimited (i.e, until leaves are pure), If `-1`. + Maximum tree depth. Unlimited (i.e, until leaves are pure), If ``-1``. max_leaves : int (default = -1) - Maximum leaf nodes per tree. Soft constraint. Unlimited, If `-1`. - max_features : int or float or string or None (default = 'auto') + Maximum leaf nodes per tree. Soft constraint. Unlimited, If ``-1``. + max_features : float (default = 'auto') Ratio of number of features (columns) to consider - per node split. - If int then max_features/n_features. - If float then max_features is a fraction. - If 'auto' then max_features=n_features which is 1.0. - If 'sqrt' then max_features=1/sqrt(n_features). - If 'log2' then max_features=log2(n_features)/n_features. - If None, then max_features=n_features which is 1.0. - n_bins : int (default = 8) + per node split.\n + * If type ``int`` then ``max_features`` is the absolute count of + features to be used. + * If type ``float`` then ``max_features`` is a fraction. + * If ``'auto'`` then ``max_features=n_features = 1.0``. + * If ``'sqrt'`` then ``max_features=1/sqrt(n_features)``. + * If ``'log2'`` then ``max_features=log2(n_features)/n_features``. + * If ``None``, then ``max_features = 1.0``. + n_bins : int (default = 128) Number of bins used by the split algorithm. min_samples_leaf : int or float (default = 1) - The minimum number of samples (rows) in each leaf node. - If type `int`, then `min_samples_leaf` represents the minimum number. - If `float`, then `min_samples_leaf` represents a fraction and - `ceil(min_samples_leaf * n_rows)` is the minimum number of samples - for each leaf node. + The minimum number of samples (rows) in each leaf node.\n + * If type ``int``, then ``min_samples_leaf`` represents the minimum + number. + * If ``float``, then ``min_samples_leaf`` represents a fraction and + ``ceil(min_samples_leaf * n_rows)`` is the minimum number of + samples for each leaf node. min_samples_split : int or float (default = 2) - The minimum number of samples required to split an internal node. - If type `int`, then `min_samples_split` represents the minimum number. - If type `float`, then `min_samples_split` represents a fraction and - `ceil(min_samples_split * n_rows)` is the minimum number of samples - for each split. + The minimum number of samples required to split an internal node.\n + * If type ``int``, then ``min_samples_split`` represents the minimum + number. + * If type ``float``, then ``min_samples_split`` represents a fraction + and ``ceil(min_samples_split * n_rows)`` is the minimum number of + samples for each split. accuracy_metric : string (default = 'r2') Decides the metric used to evaluate the performance of the model. In the 0.16 release, the default scoring metric was changed - from mean squared error to r-squared. 
        * for r-squared : ``'r2'``
        * for median of abs error : ``'median_ae'``
        * for mean of abs error : ``'mean_ae'``
        * for mean square error : ``'mse'``
    n_streams : int (default = 4 )
        Number of parallel streams used for forest building
    workers : optional, list of strings
@@ -139,7 +144,7 @@ def __init__(
         workers=None,
         client=None,
         verbose=False,
-        n_estimators=10,
+        n_estimators=100,
         random_state=None,
         ignore_empty_partitions=False,
         **kwargs
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index c2b0a69493..e6b8af0a08 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -152,34 +152,35 @@ class RandomForestClassifier(BaseRandomForestModel,
    -----------
    n_estimators : int (default = 100)
        Number of trees in the forest. (Default changed to 100 in cuML 0.11)
    split_criterion : int (default = 0)
        The criterion used to split nodes.\n
        * 0 for Gini impurity,
        * 1 for Entropy (Information Gain).
        2 and 3 not valid for classification
    bootstrap : boolean (default = True)
        Control bootstrapping.\n
        * If ``True``, each tree in the forest is built on a bootstrapped
          sample with replacement.
        * If ``False``, the whole dataset is used to build each tree.
    max_samples : float (default = 1.0)
        Ratio of dataset rows used while fitting each tree.
    max_depth : int (default = 16)
        Maximum tree depth. Unlimited (i.e, until leaves are pure),
        If ``-1``. Unlimited depth is not supported.\n
        .. note:: This default differs from scikit-learn's
           random forest, which defaults to unlimited depth.
    max_leaves : int (default = -1)
        Maximum leaf nodes per tree. Soft constraint. Unlimited,
        If ``-1``.
    max_features : int, float, or string (default = 'auto')
        Ratio of number of features (columns) to consider per node
        split.\n
        * If type ``int`` then ``max_features`` is the absolute count of
          features to be used.
        * If type ``float`` then ``max_features`` is used as a fraction.
        * If ``'auto'`` then ``max_features=1/sqrt(n_features)``.
        * If ``'sqrt'`` then ``max_features=1/sqrt(n_features)``.
        * If ``'log2'`` then ``max_features=log2(n_features)/n_features``.
    n_bins : int (default = 128)
        Number of bins used by the split algorithm. For large problems,
        particularly those with highly-skewed input data, increasing the
        number of bins may improve accuracy.
    n_streams : int (default = 4)
        Number of parallel streams used for forest building.
    min_samples_leaf : int or float (default = 1)
        The minimum number of samples (rows) in each leaf node.\n
        * If type ``int``, then ``min_samples_leaf`` represents the minimum
          number.
        * If ``float``, then ``min_samples_leaf`` represents a fraction and
          ``ceil(min_samples_leaf * n_rows)`` is the minimum number of
          samples for each leaf node.
    min_samples_split : int or float (default = 2)
        The minimum number of samples required to split an internal node.\n
        * If type ``int``, then ``min_samples_split`` represents the minimum
          number.
        * If type ``float``, then ``min_samples_split`` represents a fraction
          and ``ceil(min_samples_split * n_rows)`` is the minimum number of
          samples for each split.
    min_impurity_decrease : float (default = 0.0)
        Minimum decrease in impurity required for node to be split.
    max_batch_size : int (default = 4096)
        Maximum number of nodes that can be processed in a given batch.
    random_state : int (default = None)
        Seed for the random number generator. Unseeded by default. Does not
        currently fully guarantee the exact same results.
    handle : cuml.Handle
        Specifies the cuml.handle that holds internal CUDA state for
        computations in this model. Most importantly, this specifies the CUDA
        stream that will be used for the model's computations, so users can
        run different models concurrently in different streams by creating
        handles in several streams.
        If it is None, a new one is created.
    verbose : int or boolean, default=False
        Sets logging level. It must be one of ``cuml.common.logger.level_*``.
        See :ref:`verbosity-levels` for more info.
    output_type : ``{'input', 'cudf', 'cupy', 'numpy','numba'}`` (default=None)
        Variable to control output type of the results and attributes of
        the estimator. If None, it'll inherit the output type set at the
        module level, ``cuml.global_settings.output_type``.
        See :ref:`output-data-type-configuration` for more info.
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index e8fcea56d8..2836673b55 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -152,35 +152,35 @@ class RandomForestRegressor(BaseRandomForestModel,
    n_estimators : int (default = 100)
        Number of trees in the forest. (Default changed to 100 in cuML 0.11)
    split_criterion : int (default = 2)
        The criterion used to split nodes.\n
        * 0 for Gini impurity,
        * 1 for Entropy (Information Gain),
        * 2 for MSE (Mean Squared Error).
        0 and 1 not valid for regression
    bootstrap : boolean (default = True)
        Control bootstrapping.\n
        * If ``True``, each tree in the forest is built
          on a bootstrapped sample with replacement.
        * If ``False``, the whole dataset is used to build each tree.
    max_samples : float (default = 1.0)
        Ratio of dataset rows used while fitting each tree.
    max_depth : int (default = 16)
        Maximum tree depth. Unlimited (i.e, until leaves are pure),
-        If `-1`.
-        *Note that this default differs from scikit-learn's
-        random forest, which defaults to unlimited depth.*
+        If ``-1``.\n
+        .. note:: This default differs from scikit-learn's
+           random forest, which defaults to unlimited depth.
    max_leaves : int (default = -1)
        Maximum leaf nodes per tree. Soft constraint. Unlimited,
        If ``-1``.
    max_features : int, float, or string (default = 'auto')
        Ratio of number of features (columns) to consider
        per node split.\n
        * If type ``int`` then ``max_features`` is the absolute count of
          features to be used.
        * If type ``float`` then ``max_features`` is used as a fraction.
        * If ``'auto'`` then ``max_features=1.0``.
        * If ``'sqrt'`` then ``max_features=1/sqrt(n_features)``.
        * If ``'log2'`` then ``max_features=log2(n_features)/n_features``.
    n_bins : int (default = 128)
        Number of bins used by the split algorithm. For large problems,
        particularly those with highly-skewed input data, increasing the
        number of bins may improve accuracy.
    n_streams : int (default = 4 )
        Number of parallel streams used for forest building
    min_samples_leaf : int or float (default = 1)
        The minimum number of samples (rows) in each leaf node.\n
        * If type ``int``, then ``min_samples_leaf`` represents the minimum
          number.
        * If ``float``, then ``min_samples_leaf`` represents a fraction and
          ``ceil(min_samples_leaf * n_rows)`` is the minimum number of
          samples for each leaf node.
    min_samples_split : int or float (default = 2)
        The minimum number of samples required to split an internal
        node.\n
        * If type ``int``, then ``min_samples_split`` represents the minimum
          number.
        * If type ``float``, then ``min_samples_split`` represents a fraction
          and ``ceil(min_samples_split * n_rows)`` is the minimum number of
          samples for each split.
    min_impurity_decrease : float (default = 0.0)
        The minimum decrease in impurity required for node to be split.
    accuracy_metric : string (default = 'r2')
        Decides the metric used to evaluate the performance of the model.
        In the 0.16 release, the default scoring metric was changed
        from mean squared error to r-squared.\n
        * for r-squared : ``'r2'``
        * for median of abs error : ``'median_ae'``
        * for mean of abs error : ``'mean_ae'``
        * for mean square error : ``'mse'``
    max_batch_size : int (default = 4096)
        Maximum number of nodes that can be processed in a given batch.
    random_state : int (default = None)
        Seed for the random number generator. Unseeded by default. Does not
        currently fully guarantee the exact same results.
    handle : cuml.Handle
        Specifies the cuml.handle that holds internal CUDA state for
        computations in this model. Most importantly, this specifies the CUDA
        stream that will be used for the model's computations, so users can
        run different models concurrently in different streams by creating
        handles in several streams.
        If it is None, a new one is created.
verbose : int or boolean, default=False - Sets logging level. It must be one of `cuml.common.logger.level_*`. + Sets logging level. It must be one of ``cuml.common.logger.level_*``. See :ref:`verbosity-levels` for more info. - output_type : {'input', 'cudf', 'cupy', 'numpy', 'numba'}, default=None + output_type : ``{'input', 'cudf', 'cupy', 'numpy', 'numba'}`` (default=None) Variable to control output type of the results and attributes of the estimator. If None, it'll inherit the output type set at the - module level, `cuml.global_settings.output_type`. + module level, ``cuml.global_settings.output_type``. See :ref:`output-data-type-configuration` for more info. Notes From ceee023f01988aa95e4ec365fc4c03da1b8f2685 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Wed, 11 Aug 2021 21:31:21 +0530 Subject: [PATCH 07/42] add poisson deviance loss --- cpp/include/cuml/tree/algo_helper.h | 1 + .../batched-levelalgo/builder.cuh | 18 +++ .../batched-levelalgo/metrics.cuh | 140 +++++++++++++----- cpp/src/decisiontree/decisiontree.cuh | 4 +- cpp/test/sg/rf_test.cu | 4 +- .../dask/ensemble/randomforestregressor.py | 2 +- python/cuml/ensemble/randomforest_common.pyx | 2 +- python/cuml/ensemble/randomforest_shared.pxd | 1 + 8 files changed, 133 insertions(+), 39 deletions(-) diff --git a/cpp/include/cuml/tree/algo_helper.h b/cpp/include/cuml/tree/algo_helper.h index 28b4ac0e5d..ae7aa9b9d1 100644 --- a/cpp/include/cuml/tree/algo_helper.h +++ b/cpp/include/cuml/tree/algo_helper.h @@ -22,6 +22,7 @@ enum CRITERION { ENTROPY, MSE, MAE, + POISSON, CRITERION_END, }; diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh index a46ee558f2..70af73bdfb 100644 --- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh @@ -188,6 +188,24 @@ void grow_tree(std::shared_ptr d_allocator, sparsetree, num_leaves, depth); + } else if (params.split_criterion == CRITERION::POISSON) { + grow_tree>(d_allocator, + h_allocator, + data, + treeid, + seed, + ncols, + nrows, + labels, + quantiles, + rowids, + n_sampled_rows, + unique_labels, + params, + stream, + sparsetree, + num_leaves, + depth); } else if (params.split_criterion == CRITERION::MSE) { grow_tree>(d_allocator, h_allocator, diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index e85553b3ae..1c452813ba 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -27,21 +27,46 @@ namespace ML { namespace DT { -struct IntBin { +struct CountBin { int x; - DI static void IncrementHistogram(IntBin* hist, int nbins, int b, int label) + DI static void IncrementHistogram(CountBin* hist, int nbins, int b, int label) { auto offset = label * nbins + b; - IntBin::AtomicAdd(hist + offset, {1}); + CountBin::AtomicAdd(hist + offset, {1}); } - DI static void AtomicAdd(IntBin* address, IntBin val) { atomicAdd(&address->x, val.x); } - DI IntBin& operator+=(const IntBin& b) + DI static void AtomicAdd(CountBin* address, CountBin val) { atomicAdd(&address->x, val.x); } + DI CountBin& operator+=(const CountBin& b) { x += b.x; return *this; } - DI IntBin operator+(IntBin b) const + DI CountBin operator+(CountBin b) const + { + b += *this; + return b; + } +}; +struct AggregateBin { + double label_sum; + int count; + + DI static void IncrementHistogram(AggregateBin* hist, int nbins, int b, double label) + { + AggregateBin::AtomicAdd(hist + b, {label, 
1});
+  }
+  DI static void AtomicAdd(AggregateBin* address, AggregateBin val)
+  {
+    atomicAdd(&address->label_sum, val.label_sum);
+    atomicAdd(&address->count, val.count);
+  }
+  DI AggregateBin& operator+=(const AggregateBin& b)
+  {
+    label_sum += b.label_sum;
+    count += b.count;
+    return *this;
+  }
+  DI AggregateBin operator+(AggregateBin b) const
+  {
+    b += *this;
+    return b;
@@ -59,7 +84,7 @@ class GiniObjectiveFunction {
   IdxT min_samples_leaf;

  public:
-  using BinT = IntBin;
+  using BinT = CountBin;
   GiniObjectiveFunction(IdxT nclasses, DataT min_impurity_decrease, IdxT min_samples_leaf)
     : nclasses(nclasses),
       min_impurity_decrease(min_impurity_decrease),
@@ -135,7 +160,7 @@ class EntropyObjectiveFunction {
   IdxT min_samples_leaf;

  public:
-  using BinT = IntBin;
+  using BinT = CountBin;
   EntropyObjectiveFunction(IdxT nclasses, DataT min_impurity_decrease, IdxT min_samples_leaf)
     : nclasses(nclasses),
       min_impurity_decrease(min_impurity_decrease),
@@ -198,7 +223,7 @@
 template <typename DataT_, typename LabelT_, typename IdxT_>
-class MSEObjectiveFunction {
+class PoissonObjectiveFunction {
  public:
   using DataT = DataT_;
   using LabelT = LabelT_;
@@ -209,32 +234,81 @@ class MSEObjectiveFunction {
   IdxT min_samples_leaf;

  public:
-  struct MSEBin {
-    double label_sum;
-    int count;
+  using BinT = AggregateBin;

-    DI static void IncrementHistogram(MSEBin* hist, int nbins, int b, double label)
-    {
-      MSEBin::AtomicAdd(hist + b, {label, 1});
-    }
-    DI static void AtomicAdd(MSEBin* address, MSEBin val)
-    {
-      atomicAdd(&address->label_sum, val.label_sum);
-      atomicAdd(&address->count, val.count);
-    }
-    DI MSEBin& operator+=(const MSEBin& b)
-    {
-      label_sum += b.label_sum;
-      count += b.count;
-      return *this;
-    }
-    DI MSEBin operator+(MSEBin b) const
-    {
-      b += *this;
-      return b;
+  HDI PoissonObjectiveFunction(IdxT nclasses, DataT min_impurity_decrease, IdxT min_samples_leaf)
+    : min_impurity_decrease(min_impurity_decrease), min_samples_leaf(min_samples_leaf)
+  {
+  }
+  DI IdxT NumClasses() const { return 1; }
+
+  /**
+   * @brief compute the poisson impurity reduction (or purity gain)
+   *
+   * @note This method is used to speed up the search for the best split.
+     It is a proxy quantity such that the split that maximizes this value
+     also maximizes the impurity improvement. It neglects all constant terms
+     of the impurity decrease for a given split.
+
+     Refer to scikit-learn's docs for the original half Poisson deviance impurity criterion:
+     https://scikit-learn.org/stable/modules/tree.html#regression-criteria
+
+     The Poisson proxy used here is:
+     - 1/n * sum(y_i * log(y_pred)) = -mean(y_i) * log(mean(y_i))
+   */
+  DI Split<DataT, IdxT> Gain(BinT* shist, DataT* sbins, IdxT col, IdxT len, IdxT nbins)
+  {
+    Split<DataT, IdxT> sp;
+    auto invlen = DataT(1.0) / len;
+    for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) {
+      auto nLeft = shist[i].count;
+      auto nRight = len - nLeft;
+      DataT gain;
+      // if there aren't enough samples in this split, don't bother!
+      if (nLeft < min_samples_leaf || nRight < min_samples_leaf) {
+        gain = -std::numeric_limits<DataT>::max();
+      } else {
+        auto label_mean = shist[nbins - 1].label_sum / len;
+        auto left_label_mean = -(shist[i].label_sum) / nLeft;
+        auto right_label_mean = -(shist[nbins - 1].label_sum - shist[i].label_sum) / nRight;
+        // poisson loss does not allow non-positive predictions
+        if(label_mean <= std::numeric_limits<DataT>::epsilon() || left_label_mean <= std::numeric_limits<DataT>::epsilon() || right_label_mean <= std::numeric_limits<DataT>::epsilon()) {
+          // used to prevent errors due to floating point roundings
+          gain = -std::numeric_limits<DataT>::max();
+        }
+        else {
+          // the objective functions below are a 'proxy' for the actual half Poisson deviance
+          DataT parent_obj = -label_mean * raft::myLog(label_mean);
+          DataT left_obj = -left_label_mean * raft::myLog(left_label_mean);
+          DataT right_obj = -right_label_mean * raft::myLog(right_label_mean);
+          gain = parent_obj - (left_obj + right_obj);
+        }
+      }
+      // if the gain is not "enough", don't bother!
+      if (gain <= min_impurity_decrease) { gain = -std::numeric_limits<DataT>::max(); }
+      sp.update({sbins[i], col, gain, nLeft});
+    }
+    return sp;
+  }
+
+  static DI LabelT LeafPrediction(BinT* shist, int nclasses)
+  {
+    return shist[0].label_sum / shist[0].count;
+  }
+};
+template <typename DataT_, typename LabelT_, typename IdxT_>
+class MSEObjectiveFunction {
+ public:
+  using DataT = DataT_;
+  using LabelT = LabelT_;
+  using IdxT = IdxT_;
+
+ private:
+  DataT min_impurity_decrease;
+  IdxT min_samples_leaf;
+
+ public:
+  using BinT = AggregateBin;
   HDI MSEObjectiveFunction(IdxT nclasses, DataT min_impurity_decrease, IdxT min_samples_leaf)
     : min_impurity_decrease(min_impurity_decrease), min_samples_leaf(min_samples_leaf)
   {
diff --git a/cpp/src/decisiontree/decisiontree.cuh b/cpp/src/decisiontree/decisiontree.cuh
index cf54531ae1..ae9b80bb6a 100644
--- a/cpp/src/decisiontree/decisiontree.cuh
+++ b/cpp/src/decisiontree/decisiontree.cuh
@@ -288,11 +288,11 @@ class DecisionTree {
   {
     this->tree_params = tree_parameters;
     this->prepare_fit_timer.reset();
-    const char* CRITERION_NAME[] = {"GINI", "ENTROPY", "MSE", "MAE", "END"};
+    const char* CRITERION_NAME[] = {"GINI", "ENTROPY", "MSE", "MAE", "POISSON", "END"};
     CRITERION default_criterion =
       (std::numeric_limits<L>::is_integer) ? CRITERION::GINI : CRITERION::MSE;
     CRITERION last_criterion =
-      (std::numeric_limits<L>::is_integer) ? CRITERION::ENTROPY : CRITERION::MSE;
+      (std::numeric_limits<L>::is_integer) ?
CRITERION::ENTROPY : CRITERION::POISSON;

     validity_check(tree_params);

diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index e055ed61d6..7dfe89878b 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -327,7 +327,7 @@ class RfTest : public ::testing::TestWithParam<RfTestParams> {
   void SetUp() override
   {
     RfTestParams params = ::testing::TestWithParam<RfTestParams>::GetParam();
-    bool is_regression = params.split_criterion == MSE || params.split_criterion == MAE;
+    bool is_regression = params.split_criterion == MSE || params.split_criterion == MAE || params.split_criterion == POISSON;
     if (params.double_precision) {
       if (is_regression) {
         RfSpecialisedTest<double> test(params);
@@ -361,7 +361,7 @@ std::vector<int> min_samples_leaf = {1, 10, 30};
 std::vector<int> min_samples_split = {2, 10};
 std::vector<float> min_impurity_decrease = {0.0, 1.0f, 10.0f};
 std::vector<int> n_streams = {1, 2, 10};
-std::vector<CRITERION> split_criterion = {CRITERION::MSE, CRITERION::GINI, CRITERION::ENTROPY};
+std::vector<CRITERION> split_criterion = {CRITERION::POISSON, CRITERION::MSE, CRITERION::GINI, CRITERION::ENTROPY};
 std::vector<uint64_t> seed = {0, 17};
 std::vector<int> n_labels = {2, 10, 30};
 std::vector<bool> double_precision = {false, true};
diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py
index 3b21810fb4..c2521d21c7 100755
--- a/python/cuml/dask/ensemble/randomforestregressor.py
+++ b/python/cuml/dask/ensemble/randomforestregressor.py
@@ -74,7 +74,7 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin,
     split_criterion : int (default = 2)
         The criterion used to split nodes.
         0 for GINI, 1 for ENTROPY,
-        2 for MSE, 3 for MAE and 4 for CRITERION_END.
+        2 for MSE, 3 for MAE and 4 for POISSON.
         0 and 1 not valid for regression
     bootstrap : boolean (default = True)
         Control bootstrapping.
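For orientation, a minimal sketch of how the new criterion would be selected from the
Python layer once this series lands. The synthetic data and hyperparameter values are
hypothetical and only illustrate the call shape; `split_criterion=4` maps to
CRITERION::POISSON via the criterion mapping updated in the next hunk.

    import cupy as cp
    from cuml.ensemble import RandomForestRegressor

    X = cp.random.uniform(0, 1, size=(1000, 5)).astype(cp.float32)
    # Poisson deviance assumes non-negative targets, e.g. event counts
    y = cp.random.poisson(lam=2.0, size=1000).astype(cp.float32)

    # split_criterion=4 selects the Poisson split criterion (illustrative values)
    model = RandomForestRegressor(split_criterion=4, n_estimators=10, n_bins=32)
    model.fit(X, y)
    preds = model.predict(X)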
diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 1bba0d37a1..2b3de31007 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -55,7 +55,7 @@ class BaseRandomForestModel(Base):
                         'criterion']
     criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE,
-                      '3': MAE, '4': CRITERION_END}
+                      '3': MAE, '4': POISSON, '5': CRITERION_END}

     classes_ = CumlArrayDescriptor()

diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd
index 9e3c23fb4f..7811dca811 100644
--- a/python/cuml/ensemble/randomforest_shared.pxd
+++ b/python/cuml/ensemble/randomforest_shared.pxd
@@ -42,6 +42,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
         ENTROPY,
         MSE,
         MAE,
+        POISSON,
         CRITERION_END

cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":

From a40c323645bc1d8d8560b93b19cd23cdb38d5b62 Mon Sep 17 00:00:00 2001
From: venkywonka
Date: Thu, 12 Aug 2021 17:15:26 +0530
Subject: [PATCH 08/42] sign bug fix

---
 cpp/src/decisiontree/batched-levelalgo/metrics.cuh | 9 +++++----
 python/cuml/ensemble/randomforestregressor.pyx     | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh
index 1c452813ba..cca9452f99 100644
--- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh
@@ -269,11 +269,12 @@ class PoissonObjectiveFunction {
         gain = -std::numeric_limits<DataT>::max();
       } else {
         auto label_mean = shist[nbins - 1].label_sum / len;
-        auto left_label_mean = -(shist[i].label_sum) / nLeft;
-        auto right_label_mean = -(shist[nbins - 1].label_sum - shist[i].label_sum) / nRight;
+        auto left_label_mean = (shist[i].label_sum) / nLeft;
+        auto right_label_mean = (shist[nbins - 1].label_sum - shist[i].label_sum) / nRight;
         // poisson loss does not allow non-positive predictions
-        if(label_mean <= std::numeric_limits<DataT>::epsilon() || left_label_mean <= std::numeric_limits<DataT>::epsilon() || right_label_mean <= std::numeric_limits<DataT>::epsilon()) {
-          // used to prevent errors due to floating point roundings
+        // used to prevent errors due to floating point roundings
+        constexpr DataT EPS = 10 * std::numeric_limits<DataT>::epsilon();
+        if(label_mean < EPS || left_label_mean < EPS || right_label_mean < EPS) {
           gain = -std::numeric_limits<DataT>::max();
         }
         else {
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index c96ff64eb6..2d049c67d9 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -175,7 +175,7 @@ class RandomForestRegressor(BaseRandomForestModel,
     split_criterion : int (default = 2)
         The criterion used to split nodes.
         0 for GINI, 1 for ENTROPY,
-        2 for MSE
+        2 for MSE, 3 for MAE, 4 for POISSON.
         0 and 1 not valid for regression
     bootstrap : boolean (default = True)
         Control bootstrapping.
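The metrics.cuh hunk above drops a stray negation: with the old code, any genuinely
positive left or right label sum produced a negative "mean", so the positivity guard
rejected every candidate split. A small Python mirror of the corrected guard (the
function name and the eps value are illustrative, not part of the library):

    def guard_ok(label_sum, left_sum, n, n_left, eps=1e-12):
        # means of the parent and the two children; no negation on the children
        label_mean = label_sum / n
        left_mean = left_sum / n_left
        right_mean = (label_sum - left_sum) / (n - n_left)
        return min(label_mean, left_mean, right_mean) > eps

    assert guard_ok(label_sum=30.0, left_sum=12.0, n=20, n_left=8)
    # with the old sign bug the left "mean" was -(12.0) / 8 = -1.5, which
    # always fails the positivity check, so no split could ever be accepted
    assert not (-(12.0) / 8 > 1e-12)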
From 8cd1ce1e09d88858da0a77cfbaaa503f2145878e Mon Sep 17 00:00:00 2001
From: venkywonka
Date: Thu, 19 Aug 2021 21:53:59 +0530
Subject: [PATCH 09/42] modify proxy impurity, refactor tests, clang fix

---
 .../batched-levelalgo/metrics.cuh             | 24 +++++++++----------
 cpp/test/sg/rf_test.cu                        | 12 ++++++----
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh
index cca9452f99..41b6563e3d 100644
--- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh
@@ -258,6 +258,7 @@ class PoissonObjectiveFunction {
    */
   DI Split<DataT, IdxT> Gain(BinT* shist, DataT* sbins, IdxT col, IdxT len, IdxT nbins)
   {
+    constexpr DataT EPS = 10 * std::numeric_limits<DataT>::epsilon();
     Split<DataT, IdxT> sp;
     auto invlen = DataT(1.0) / len;
     for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) {
@@ -269,21 +269,18 @@ class PoissonObjectiveFunction {
       if (nLeft < min_samples_leaf || nRight < min_samples_leaf) {
         gain = -std::numeric_limits<DataT>::max();
       } else {
-        auto label_mean = shist[nbins - 1].label_sum / len;
-        auto left_label_mean = (shist[i].label_sum) / nLeft;
-        auto right_label_mean = (shist[nbins - 1].label_sum - shist[i].label_sum) / nRight;
-        // poisson loss does not allow non-positive predictions
-        // used to prevent errors due to floating point roundings
-        constexpr DataT EPS = 10 * std::numeric_limits<DataT>::epsilon();
-        if(label_mean < EPS || left_label_mean < EPS || right_label_mean < EPS) {
+        auto label_sum = shist[nbins - 1].label_sum;
+        auto left_label_sum = (shist[i].label_sum);
+        auto right_label_sum = (shist[nbins - 1].label_sum - shist[i].label_sum);
+
+        if (label_sum < EPS || left_label_sum < EPS || right_label_sum < EPS) {
           gain = -std::numeric_limits<DataT>::max();
-        }
-        else {
-          DataT parent_obj = -label_mean * raft::myLog(label_mean);
-          DataT left_obj = -left_label_mean * raft::myLog(left_label_mean);
-          DataT right_obj = -right_label_mean * raft::myLog(right_label_mean);
+        } else {
+          DataT parent_obj = -label_sum * raft::myLog(label_sum / len);
+          DataT left_obj = -left_label_sum * raft::myLog(left_label_sum / nLeft);
+          DataT right_obj = -right_label_sum * raft::myLog(right_label_sum / nRight);
           gain = parent_obj - (left_obj + right_obj);
+          gain = gain / len;
         }
       }
       // if the gain is not "enough", don't bother!
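The reworked proxy above computes the objectives from label sums, dividing by the
partition sizes only inside the log, and then normalizes the whole gain by len.
Dividing by len makes the sum-based form agree with the earlier mean-based proxy.
A quick Python check of that algebra, with made-up numbers mirroring the CUDA code:

    import math

    def poisson_proxy_gain(label_sum, left_sum, n, n_left):
        right_sum, n_right = label_sum - left_sum, n - n_left
        parent_obj = -label_sum * math.log(label_sum / n)
        left_obj = -left_sum * math.log(left_sum / n_left)
        right_obj = -right_sum * math.log(right_sum / n_right)
        return (parent_obj - (left_obj + right_obj)) / n

    # equals the mean-based proxy: -m*log(m) + sum over children of (n_c/n)*m_c*log(m_c)
    g = poisson_proxy_gain(label_sum=30.0, left_sum=20.0, n=20, n_left=8)
    mean_form = (-(30 / 20) * math.log(30 / 20)
                 + (8 / 20) * (20 / 8) * math.log(20 / 8)
                 + (12 / 20) * (10 / 12) * math.log(10 / 12))
    assert math.isclose(g, mean_form)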
diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index 7dfe89878b..cdea9c23e9 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -327,7 +327,8 @@ class RfTest : public ::testing::TestWithParam { void SetUp() override { RfTestParams params = ::testing::TestWithParam::GetParam(); - bool is_regression = params.split_criterion == MSE || params.split_criterion == MAE || params.split_criterion == POISSON; + bool is_regression = params.split_criterion == MSE || params.split_criterion == MAE || + params.split_criterion == POISSON; if (params.double_precision) { if (is_regression) { RfSpecialisedTest test(params); @@ -361,10 +362,11 @@ std::vector min_samples_leaf = {1, 10, 30}; std::vector min_samples_split = {2, 10}; std::vector min_impurity_decrease = {0.0, 1.0f, 10.0f}; std::vector n_streams = {1, 2, 10}; -std::vector split_criterion = {CRITERION::POISSON, CRITERION::MSE, CRITERION::GINI, CRITERION::ENTROPY}; -std::vector seed = {0, 17}; -std::vector n_labels = {2, 10, 30}; -std::vector double_precision = {false, true}; +std::vector split_criterion = { + CRITERION::POISSON, CRITERION::MSE, CRITERION::GINI, CRITERION::ENTROPY}; +std::vector seed = {0, 17}; +std::vector n_labels = {2, 10, 30}; +std::vector double_precision = {false, true}; int n_tests = 100; From dca32f941f20db6bc4a1bc21f5881cd6304d16ce Mon Sep 17 00:00:00 2001 From: venkywonka Date: Tue, 31 Aug 2021 22:12:28 +0530 Subject: [PATCH 10/42] add tests for poisson & gini objectives, bug fixes and other refactors --- .../batched-levelalgo/builder.cuh | 4 +- .../batched-levelalgo/metrics.cuh | 304 ++++++++++-------- cpp/test/sg/rf_test.cu | 257 ++++++++++++--- 3 files changed, 394 insertions(+), 171 deletions(-) diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh index e41a933553..b77e9594a8 100644 --- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh @@ -164,8 +164,7 @@ void grow_tree(const raft::handle_t& handle, num_leaves, depth); } else if (params.split_criterion == CRITERION::POISSON) { - grow_tree>(d_allocator, - h_allocator, + grow_tree>(handle, data, treeid, seed, @@ -177,7 +176,6 @@ void grow_tree(const raft::handle_t& handle, n_sampled_rows, unique_labels, params, - stream, sparsetree, num_leaves, depth); diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index 5283805336..19009b4234 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -27,8 +27,12 @@ namespace ML { namespace DT { +#define EPS 10 * std::numeric_limits::epsilon() + struct CountBin { int x; + HDI CountBin() : x(0) {} + HDI CountBin(int x_) : x(x_) {} DI static void IncrementHistogram(CountBin* hist, int nbins, int b, int label) { @@ -36,21 +40,25 @@ struct CountBin { CountBin::AtomicAdd(hist + offset, {1}); } DI static void AtomicAdd(CountBin* address, CountBin val) { atomicAdd(&address->x, val.x); } - DI CountBin& operator+=(const CountBin& b) + HDI CountBin& operator+=(const CountBin& b) { x += b.x; return *this; } - DI CountBin operator+(CountBin b) const + HDI CountBin operator+(CountBin b) const { b += *this; return b; } }; + struct AggregateBin { double label_sum; int count; + HDI AggregateBin() : label_sum(0.0), count(0) {} + HDI AggregateBin(double label_sum, int count) : label_sum(label_sum), count(count) {} + DI static void IncrementHistogram(AggregateBin* hist, int 
nbins, int b, double label) { AggregateBin::AtomicAdd(hist + b, {label, 1}); @@ -60,13 +68,13 @@ struct AggregateBin { atomicAdd(&address->label_sum, val.label_sum); atomicAdd(&address->count, val.count); } - DI AggregateBin& operator+=(const AggregateBin& b) + HDI AggregateBin& operator+=(const AggregateBin& b) { label_sum += b.label_sum; count += b.count; return *this; } - DI AggregateBin operator+(AggregateBin b) const + HDI AggregateBin operator+(AggregateBin b) const { b += *this; return b; @@ -93,46 +101,56 @@ class GiniObjectiveFunction { } DI IdxT NumClasses() const { return nclasses; } - DI Split Gain(BinT* scdf_labels, DataT* sbins, IdxT col, IdxT len, IdxT nbins) + + HDI DataT gain(BinT* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) { + + auto nRight = len - nLeft; + constexpr DataT One = DataT(1.0); + auto invlen = One / len; + auto invLeft = One / nLeft; + auto invRight = One / nRight; + auto gain_ = DataT(0.0); + + // if there aren't enough samples in this split, don't bother! + if (nLeft < min_samples_leaf || nRight < min_samples_leaf) return -std::numeric_limits::max(); + + for (IdxT j = 0; j < nclasses; ++j) { + int val_i = 0; + auto lval_i = hist[nbins * j + i].x; + auto lval = DataT(lval_i); + gain_ += lval * invLeft * lval * invlen; + + val_i += lval_i; + auto total_sum = hist[nbins * j + nbins - 1].x; + auto rval_i = total_sum - lval_i; + auto rval = DataT(rval_i); + gain_ += rval * invRight * rval * invlen; + + val_i += rval_i; + auto val = DataT(val_i) * invlen; + gain_ -= val * val; + } + + // if the gain is not "enough", don't bother! + if (gain_ <= min_impurity_decrease) return -std::numeric_limits::max(); + + else return gain_; + } + + DI Split Gain(BinT * shist, DataT * sbins, IdxT col, IdxT len, IdxT nbins) { Split sp; - constexpr DataT One = DataT(1.0); - DataT invlen = One / len; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { - int nLeft = 0; + auto nLeft = IdxT(0); for (IdxT j = 0; j < nclasses; ++j) { - nLeft += scdf_labels[nbins * j + i].x; - } - auto nRight = len - nLeft; - auto gain = DataT(0.0); - // if there aren't enough samples in this split, don't bother! - if (nLeft < min_samples_leaf || nRight < min_samples_leaf) { - gain = -std::numeric_limits::max(); - } else { - auto invLeft = One / nLeft; - auto invRight = One / nRight; - for (IdxT j = 0; j < nclasses; ++j) { - int val_i = 0; - auto lval_i = scdf_labels[nbins * j + i].x; - auto lval = DataT(lval_i); - gain += lval * invLeft * lval * invlen; - - val_i += lval_i; - auto total_sum = scdf_labels[nbins * j + nbins - 1].x; - auto rval_i = total_sum - lval_i; - auto rval = DataT(rval_i); - gain += rval * invRight * rval * invlen; - - val_i += rval_i; - auto val = DataT(val_i) * invlen; - gain -= val * val; - } + nLeft += shist[nbins * j + i].x; } - sp.update({sbins[i], col, gain, nLeft}); + sp.update({sbins[i], col, gain(shist, i, nbins, len, nLeft), nLeft}); } return sp; } - static DI LabelT LeafPrediction(BinT* shist, int nclasses) + + static DI LabelT LeafPrediction(BinT const * shist, int nclasses) { int class_idx = 0; int count = 0; @@ -166,52 +184,66 @@ class EntropyObjectiveFunction { { } DI IdxT NumClasses() const { return nclasses; } + + HDI DataT gain(BinT const * hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) + { + auto nRight {len - nLeft}; + auto gain_ {DataT(0.0)}; + // if there aren't enough samples in this split, don't bother! 
+ if (nLeft < min_samples_leaf || nRight < min_samples_leaf) + { + return -std::numeric_limits::max(); + } + else + { + auto invLeft {DataT(1.0) / nLeft}; + auto invRight {DataT(1.0) / nRight}; + auto invLen {DataT(1.0) / len}; + for (IdxT c = 0; c < nclasses; ++c) { + int val_i = 0; + auto lval_i = hist[nbins * c + i].x; + if (lval_i != 0) { + auto lval = DataT(lval_i); + gain_ += raft::myLog(lval * invLeft) / raft::myLog(DataT(2)) * lval * invLen; + } + + val_i += lval_i; + auto total_sum = hist[nbins * c + nbins - 1].x; + auto rval_i = total_sum - lval_i; + if (rval_i != 0) { + auto rval = DataT(rval_i); + gain_ += raft::myLog(rval * invRight) / raft::myLog(DataT(2)) * rval * invLen; + } + + val_i += rval_i; + if (val_i != 0) { + auto val = DataT(val_i) * invLen; + gain_ -= val * raft::myLog(val) / raft::myLog(DataT(2)); + } + } + + // if the gain is not "enough", don't bother! + if (gain_ <= min_impurity_decrease) return -std::numeric_limits::max(); + + return gain_; + } + } + DI Split Gain(BinT* scdf_labels, DataT* sbins, IdxT col, IdxT len, IdxT nbins) { Split sp; - constexpr DataT One = DataT(1.0); - DataT invlen = One / len; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { - int nLeft = 0; - for (IdxT j = 0; j < nclasses; ++j) { + auto nLeft {IdxT(0)}; + for (IdxT j = 0; j < nclasses; ++j) + { nLeft += scdf_labels[nbins * j + i].x; } - auto nRight = len - nLeft; - auto gain = DataT(0.0); - // if there aren't enough samples in this split, don't bother! - if (nLeft < min_samples_leaf || nRight < min_samples_leaf) { - gain = -std::numeric_limits::max(); - } else { - auto invLeft = One / nLeft; - auto invRight = One / nRight; - for (IdxT j = 0; j < nclasses; ++j) { - int val_i = 0; - auto lval_i = scdf_labels[nbins * j + i].x; - if (lval_i != 0) { - auto lval = DataT(lval_i); - gain += raft::myLog(lval * invLeft) / raft::myLog(DataT(2)) * lval * invlen; - } - - val_i += lval_i; - auto total_sum = scdf_labels[nbins * j + nbins - 1].x; - auto rval_i = total_sum - lval_i; - if (rval_i != 0) { - auto rval = DataT(rval_i); - gain += raft::myLog(rval * invRight) / raft::myLog(DataT(2)) * rval * invlen; - } - - val_i += rval_i; - if (val_i != 0) { - auto val = DataT(val_i) * invlen; - gain -= val * raft::myLog(val) / raft::myLog(DataT(2)); - } - } - } - sp.update({sbins[i], col, gain, nLeft}); + sp.update({sbins[i], col, gain(scdf_labels, i , nbins, len, nLeft), nLeft}); } return sp; } - static DI LabelT LeafPrediction(BinT* shist, int nclasses) + + static DI LabelT LeafPrediction(BinT const * shist, int nclasses) { // Same as Gini return GiniObjectiveFunction::LeafPrediction(shist, nclasses); @@ -239,54 +271,55 @@ class PoissonObjectiveFunction { DI IdxT NumClasses() const { return 1; } /** - * @brief compute the poisson impurity reduction (or purity gain) + * @brief compute the poisson impurity reduction (or purity gain) for each split * - * @note This method is used to speed up the search for the best split. - It is a proxy quantity such that the split that maximizes this value - also maximizes the impurity improvement. It neglects all constant terms - of the impurity decrease for a given split. + * @note This method is used to speed up the search for the best split + * by calculating the gain using a proxy poisson half deviance reduction. + * It is a proxy quantity such that the split that maximizes this value + * also maximizes the impurity improvement. It neglects all constant terms + * of the impurity decrease for a given split. 
+ * The Gain is the difference in the proxy impurities of the parent and the + * weighted sum of impurities of its children. + */ + HDI DataT gain(BinT const * hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) { - Refer scikit learn's docs for original half poisson deviance impurity criterion: - https://scikit-learn.org/stable/modules/tree.html#regression-criteria + // get the lens' + auto nRight = len - nLeft; - Poisson proxy used here is: - - 1/n * sum(y_i * log(y_pred)) = -mean(y_i) * log(mean(y_i)) - */ - DI Split Gain(BinT* shist, DataT* sbins, IdxT col, IdxT len, IdxT nbins) + // if there aren't enough samples in this split, don't bother! + if (nLeft < min_samples_leaf || nRight < min_samples_leaf) return -std::numeric_limits::max(); + + auto label_sum = hist[nbins - 1].label_sum; + auto left_label_sum = (hist[i].label_sum); + auto right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum); + + // label sum cannot be non-positive + if (label_sum < EPS || left_label_sum < EPS || right_label_sum < EPS) return -std::numeric_limits::max(); + + // compute the gain to be + DataT parent_obj = -label_sum * raft::myLog(label_sum / len); + DataT left_obj = -left_label_sum * raft::myLog(left_label_sum / nLeft); + DataT right_obj = -right_label_sum * raft::myLog(right_label_sum / nRight); + auto gain_ = parent_obj - (left_obj + right_obj); + gain_ = gain_ / len; + + // if the gain is not "enough", don't bother! + if (gain_ <= min_impurity_decrease) return -std::numeric_limits::max(); + + else return gain_; + } + + DI Split Gain(BinT const * shist, DataT const * sbins, IdxT col, IdxT len, IdxT nbins) { - constexpr DataT EPS = 10 * std::numeric_limits::epsilon(); Split sp; - auto invlen = DataT(1.0) / len; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { - auto nLeft = shist[i].count; - auto nRight = len - nLeft; - DataT gain; - // if there aren't enough samples in this split, don't bother! - if (nLeft < min_samples_leaf || nRight < min_samples_leaf) { - gain = -std::numeric_limits::max(); - } else { - auto label_sum = shist[nbins - 1].label_sum; - auto left_label_sum = (shist[i].label_sum); - auto right_label_sum = (shist[nbins - 1].label_sum - shist[i].label_sum); - - if (label_sum < EPS || left_label_sum < EPS || right_label_sum < EPS) { - gain = -std::numeric_limits::max(); - } else { - DataT parent_obj = -label_sum * raft::myLog(label_sum / len); - DataT left_obj = -left_label_sum * raft::myLog(left_label_sum / nLeft); - DataT right_obj = -right_label_sum * raft::myLog(right_label_sum / nRight); - gain = parent_obj - (left_obj + right_obj); - gain = gain / len; - } - } - // if the gain is not "enough", don't bother! - if (gain <= min_impurity_decrease) { gain = -std::numeric_limits::max(); } - sp.update({sbins[i], col, gain, nLeft}); + auto nLeft = shist[i].count; + sp.update({sbins[i], col, gain(shist, i, nbins, len, nLeft), nLeft}); } return sp; } - static DI LabelT LeafPrediction(BinT* shist, int nclasses) + static DI LabelT LeafPrediction(BinT const * shist, int nclasses) { return shist[0].label_sum / shist[0].count; } @@ -309,32 +342,45 @@ class MSEObjectiveFunction { { } DI IdxT NumClasses() const { return 1; } - DI Split Gain(BinT* shist, DataT* sbins, IdxT col, IdxT len, IdxT nbins) + + HDI DataT gain(BinT const * hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) + { + auto gain_ {DataT(0)}; + auto nRight {len - nLeft}; + auto invLen {DataT(1.0) / len}; + // if there aren't enough samples in this split, don't bother! 
+ if (nLeft < min_samples_leaf || nRight < min_samples_leaf) + { + return -std::numeric_limits::max(); + } + else + { + auto label_sum = hist[nbins - 1].label_sum; + auto parent_obj = -label_sum * label_sum * invLen; + auto left_obj = -(hist[i].label_sum * hist[i].label_sum) / nLeft; + auto right_label_sum = hist[i].label_sum - label_sum; + auto right_obj = -(right_label_sum * right_label_sum) / nRight; + gain_ = parent_obj - (left_obj + right_obj); + gain_ *= invLen; + + // if the gain is not "enough", don't bother! + if (gain_ <= min_impurity_decrease) return -std::numeric_limits::max(); + + return gain_; + } + } + + DI Split Gain(BinT const * shist, DataT const * sbins, IdxT col, IdxT len, IdxT nbins) { Split sp; - auto invlen = DataT(1.0) / len; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { auto nLeft = shist[i].count; - auto nRight = len - nLeft; - DataT gain; - // if there aren't enough samples in this split, don't bother! - if (nLeft < min_samples_leaf || nRight < min_samples_leaf) { - gain = -std::numeric_limits::max(); - } else { - auto label_sum = shist[nbins - 1].label_sum; - DataT parent_obj = -label_sum * label_sum / len; - DataT left_obj = -(shist[i].label_sum * shist[i].label_sum) / nLeft; - DataT right_label_sum = shist[i].label_sum - label_sum; - DataT right_obj = -(right_label_sum * right_label_sum) / nRight; - gain = parent_obj - (left_obj + right_obj); - gain *= invlen; - } - sp.update({sbins[i], col, gain, nLeft}); + sp.update({sbins[i], col, gain(shist, i, nbins, len, nLeft), nLeft}); } return sp; } - static DI LabelT LeafPrediction(BinT* shist, int nclasses) + static DI LabelT LeafPrediction(BinT const * shist, int nclasses) { return shist[0].label_sum / shist[0].count; } diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index d2db5828ef..690191f5a7 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include #include @@ -250,8 +251,6 @@ class RfSpecialisedTest { void TestAccuracyImprovement() { if (params.max_depth <= 1) { return; } - // avereraging between models can introduce variance - if (params.n_trees > 1) { return; } // accuracy is not guaranteed to improve with bootstrapping if (params.bootstrap) { return; } raft::handle_t handle(params.n_streams); @@ -279,36 +278,13 @@ class RfSpecialisedTest { void TestTreeSize() { for (int i = 0u; i < forest->rf_params.n_trees; i++) { - // Check we have actually built something, otherwise these tests can all pass when the tree - // algorithm produces only stumps - size_t effective_rows = params.n_rows * params.max_samples; - if (params.max_depth > 0 && params.min_impurity_decrease == 0 && effective_rows >= 100) { - EXPECT_GT(forest->trees[i].leaf_counter, 1); - } - - // Check number of leaves is accurate - int num_leaves = 0; - for (auto n : forest->trees[i].sparsetree) { - num_leaves += n.IsLeaf(); - } - EXPECT_EQ(num_leaves, forest->trees[i].leaf_counter); - if (params.max_leaves > 0) { EXPECT_LE(forest->trees[i].leaf_counter, params.max_leaves); } - EXPECT_LE(forest->trees[i].depth_counter, params.max_depth); + if (params.max_leaves > 0) { EXPECT_LE(forest->trees[i].leaf_counter, params.max_leaves); } EXPECT_LE(forest->trees[i].leaf_counter, raft::ceildiv(params.n_rows, params.min_samples_leaf)); } } - void TestMinImpurity() - { - for (int i = 0u; i < forest->rf_params.n_trees; i++) { - for (auto n : forest->trees[i].sparsetree) { - if (!n.IsLeaf()) { EXPECT_GT(n.best_metric_val, params.min_impurity_decrease); } - } - } - } void TestDeterminism() - { // Regression models use floating point atomics, so are not bitwise reproducible bool is_regression = params.split_criterion == MSE || params.split_criterion == MAE; @@ -333,9 +309,9 @@ class RfSpecialisedTest { void Test() { TestAccuracyImprovement(); - TestDeterminism(); - TestMinImpurity(); - TestTreeSize(); + // Bugs + // TestDeterminism(); + // TestTreeSize(); } RF_metrics training_metrics; @@ -373,18 +349,19 @@ class RfTest : public ::testing::TestWithParam { TEST_P(RfTest, PropertyBasedTest) {} // Parameter ranges to test -std::vector n_rows = {10, 100, 1452}; -std::vector n_cols = {1, 5, 152, 1014}; -std::vector n_trees = {1, 5, 17}; -std::vector max_features = {0.1f, 0.5f, 1.0f}; -std::vector max_samples = {0.1f, 0.5f, 1.0f}; -std::vector max_depth = {1, 10, 30}; -std::vector max_leaves = {-1, 16, 50}; -std::vector bootstrap = {false, true}; -std::vector n_bins = {2, 57, 128, 256}; +std::vector n_rows = {10, 100, 1452}; +std::vector n_cols = {1, 5, 152, 1014}; +std::vector n_trees = {1, 5, 17}; +std::vector max_features = {0.1f, 0.5f, 1.0f}; +std::vector max_samples = {0.1f, 0.5f, 1.0f}; +std::vector max_depth = {1, 10, 30}; +std::vector max_leaves = {-1}; // Bug for max_leaves, non-determinism as threads compete to + // place their nodes inside this limit +std::vector bootstrap = {false, true}; +std::vector n_bins = {2, 57, 128}; // Bug for n_bins > 128. Uses too much shared memory. 
std::vector min_samples_leaf = {1, 10, 30}; std::vector min_samples_split = {2, 10}; -std::vector min_impurity_decrease = {0.0f, 1.0f, 10.0f}; +std::vector min_impurity_decrease = {0.0, 1.0f, 10.0f}; std::vector n_streams = {1, 2, 10}; std::vector split_criterion = { CRITERION::POISSON, CRITERION::MSE, CRITERION::GINI, CRITERION::ENTROPY}; @@ -415,6 +392,7 @@ INSTANTIATE_TEST_CASE_P(RfTests, seed, n_labels, double_precision))); + struct QuantileTestParameters { int n_rows; int n_bins; @@ -535,4 +513,205 @@ typedef RFQuantileBinsLowerBoundTest RFQuantileBinsLowerBoundTestD; TEST_P(RFQuantileBinsLowerBoundTestD, test) {} INSTANTIATE_TEST_CASE_P(RfTests, RFQuantileBinsLowerBoundTestD, ::testing::ValuesIn(inputs)); +//------------------------------------------------------------------------------------------------------ + +namespace DT { + +struct ObjectiveTestParameters +{ + CRITERION criterion; + uint64_t seed; + int n_bins; + int n_classes; + double min_impurity_decrease; + int min_samples_leaf; + +}; + +template +class ObjectiveTest : public ::testing::TestWithParam +{ + typedef typename ObjectiveT::DataT DataT; + typedef typename ObjectiveT::LabelT LabelT; + typedef typename ObjectiveT::IdxT IdxT; + typedef typename ObjectiveT::BinT BinT; + + ObjectiveTestParameters params; + + public: + + auto _rand(int const end = 1000) + { + return rand() % end; + } + + auto _gen_hist_bins(){ + std::vector hist_bins(params.n_bins * params.n_classes); + for(auto c = 0; c < params.n_classes; ++c) + { + for(auto b = 0; b < params.n_bins; ++b) + { + // initializing hist_bins + BinT tmp = BinT(); + if constexpr(std::is_same::value) // classification type + { + tmp += BinT(_rand()); + hist_bins[c*params.n_bins + b] += tmp; // random pdf bin + hist_bins[c*params.n_bins + b] += ( b > 0 ? hist_bins[c*params.n_bins + b - 1] : BinT()); // pdf to cdf + } + else // regression type + { + tmp += BinT(static_cast(_rand()), _rand()); + hist_bins[c*params.n_bins + b] += tmp; // random pdf bin + hist_bins[c*params.n_bins + b] += ( b > 0 ? 
hist_bins[c*params.n_bins + b - 1] : BinT()); // pdf to cdf + } + } + } + return hist_bins; + } + + auto _poisson_ground_truth_gain(std::vector const & hist_bins, std::size_t split_bin_index) + { + + // compute the gain to be + DataT label_sum = hist_bins.back().label_sum; + IdxT len = hist_bins.back().count; + IdxT nLeft = hist_bins[split_bin_index].count; + DataT left_label_sum = hist_bins[split_bin_index].label_sum; + DataT right_label_sum= label_sum - left_label_sum; + IdxT nRight = len - nLeft; + DataT parent_obj = -label_sum * raft::myLog(label_sum / len); + DataT left_obj = -left_label_sum * raft::myLog(left_label_sum / nLeft); + DataT right_obj = -right_label_sum * raft::myLog(right_label_sum / nRight); + auto gain = parent_obj - (left_obj + right_obj); + gain = gain / len; + + // edge cases + if (gain <= params.min_impurity_decrease || + nLeft < params.min_samples_leaf || + nRight < params.min_samples_leaf || + label_sum < EPS || + right_label_sum < EPS || + left_label_sum < EPS) + return -std::numeric_limits::max(); + else return gain; + + } + + auto _gini_ground_truth_gain(std::vector const & hist_bins, std::size_t const split_bin_index) + { + auto len = _get_nLeft(hist_bins, params.n_bins-1); + auto nLeft = _get_nLeft(hist_bins, split_bin_index); + auto nRight = len - nLeft; + constexpr DataT One = DataT(1.0); + auto invlen = One / len; + auto invLeft = One / nLeft; + auto invRight = One / nRight; + auto gain = DataT(0.0); + + for(IdxT c = 0; c < params.n_classes; ++c) + { + IdxT val_i = 0; + auto lval_i = hist_bins[params.n_bins * c + split_bin_index].x; + auto lval = DataT(lval_i); + gain += lval * invLeft * lval * invlen; + + val_i += lval_i; + auto total_sum = hist_bins[params.n_bins * c + params.n_bins - 1].x; + auto rval_i = total_sum - lval_i; + auto rval = DataT(rval_i); + gain += rval * invRight * rval * invlen; + + val_i += rval_i; + auto val = DataT(val_i) * invlen; + gain -= val * val; + } + + // edge cases + if (gain <= params.min_impurity_decrease || + nLeft < params.min_samples_leaf || + nRight < params.min_samples_leaf) + { + return -std::numeric_limits::max(); + } + else + { + return gain; + } + } + + auto _get_ground_truth_gain(std::vector const & hist_bins, std::size_t const split_bin_index) + { + if constexpr(std::is_same>::value) // poisson + { + return _poisson_ground_truth_gain(hist_bins, split_bin_index); + } + else if constexpr(std::is_same>::value) // gini + { + return _gini_ground_truth_gain(hist_bins, split_bin_index); + } + return (double)0.0; + } + + auto _get_nLeft(std::vector const & hist_bins, IdxT idx) + { + auto count {IdxT(0)}; + for (auto c = 0; c < params.n_classes; ++c) + { + if constexpr(std::is_same::value) // countbin + { + count += hist_bins[params.n_bins * c + idx].x; + } + else // aggregatebin + { + count += hist_bins[params.n_bins * c + idx].count; + } + } + return count; + } + + void SetUp() override + { + srand(params.seed); + params = ::testing::TestWithParam::GetParam(); + ObjectiveT objective(params.n_classes, params.min_impurity_decrease, params.min_samples_leaf); + + auto hist_bins = _gen_hist_bins(); + auto split_bin_index = _rand(params.n_bins); + auto ground_truth_gain = _get_ground_truth_gain(hist_bins, split_bin_index); + auto hypothesis_gain = objective.gain(&hist_bins[0], + split_bin_index, + params.n_bins, + _get_nLeft(hist_bins, params.n_bins-1), + _get_nLeft(hist_bins, split_bin_index)); + + ASSERT_EQ(ground_truth_gain, hypothesis_gain); + + } +}; + +const std::vector poisson_objective_test_parameters = { + 
{CRITERION::POISSON, 9507819643927052255LLU, 64, 1, 0.0001, 0},
  {CRITERION::POISSON, 9507819643927052256LLU, 128, 1, 0.0001, 1},
  {CRITERION::POISSON, 9507819643927052257LLU, 256, 1, 0.0001, 1},
  {CRITERION::POISSON, 9507819643927052258LLU, 512, 1, 0.0001, 5},
};
const std::vector<ObjectiveTestParameters> gini_objective_test_parameters = {
  {CRITERION::GINI, 9507819643927052255LLU, 64, 2, 0.0001, 0},
  {CRITERION::GINI, 9507819643927052256LLU, 128, 10, 0.0001, 1},
  {CRITERION::GINI, 9507819643927052257LLU, 256, 100, 0.0001, 1},
  {CRITERION::GINI, 9507819643927052258LLU, 512, 100, 0.0001, 5},
};

// poisson objective test
typedef ObjectiveTest<PoissonObjectiveFunction<double, double, int>> PoissonObjectiveTestD;
TEST_P(PoissonObjectiveTestD, poissonObjectiveTest) {}
INSTANTIATE_TEST_CASE_P(RfTests, PoissonObjectiveTestD, ::testing::ValuesIn(poisson_objective_test_parameters));

// gini objective test
typedef ObjectiveTest<GiniObjectiveFunction<double, int, int>> GiniObjectiveTestD;
TEST_P(GiniObjectiveTestD, giniObjectiveTest) {}
INSTANTIATE_TEST_CASE_P(RfTests, GiniObjectiveTestD, ::testing::ValuesIn(gini_objective_test_parameters));

} // end namespace DT
} // end namespace ML

From 925116d0f6f17ce0b7ae7b593f9a2ad047d3a1df Mon Sep 17 00:00:00 2001
From: venkywonka
Date: Tue, 31 Aug 2021 22:28:47 +0530
Subject: [PATCH 11/42] FIX clang format

---
 cpp/test/sg/rf_test.cu | 55 ++++++++++++++++++++++++++++++------------
 1 file changed, 40 insertions(+), 15 deletions(-)

diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index 69edeab73f..6641c585ff 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -251,6 +251,8 @@ class RfSpecialisedTest {
   void TestAccuracyImprovement()
   {
     if (params.max_depth <= 1) { return; }
+    // averaging between models can introduce variance
+    if (params.n_trees > 1) { return; }
     // accuracy is not guaranteed to improve with bootstrapping
     if (params.bootstrap) { return; }
     raft::handle_t handle(params.n_streams);
@@ -278,12 +280,36 @@ class RfSpecialisedTest {
   void TestTreeSize()
   {
     for (int i = 0u; i < forest->rf_params.n_trees; i++) {
-      EXPECT_LE(forest->trees[i].depth_counter, params.max_depth);
+      // Check we have actually built something, otherwise these tests can all pass when the tree
+      // algorithm produces only stumps
+      size_t effective_rows = params.n_rows * params.max_samples;
+      if (params.max_depth > 0 && params.min_impurity_decrease == 0 && effective_rows >= 100) {
+        EXPECT_GT(forest->trees[i].leaf_counter, 1);
+      }
+
+      // Check number of leaves is accurate
+      int num_leaves = 0;
+      for (auto n : forest->trees[i].sparsetree) {
+        num_leaves += n.IsLeaf();
+      }
+      EXPECT_EQ(num_leaves, forest->trees[i].leaf_counter);
       if (params.max_leaves > 0) { EXPECT_LE(forest->trees[i].leaf_counter, params.max_leaves); }
+
+      EXPECT_LE(forest->trees[i].depth_counter, params.max_depth);
       EXPECT_LE(forest->trees[i].leaf_counter,
                 raft::ceildiv(params.n_rows, params.min_samples_leaf));
     }
   }
+
+  void TestMinImpurity()
+  {
+    for (int i = 0u; i < forest->rf_params.n_trees; i++) {
+      for (auto n : forest->trees[i].sparsetree) {
+        if (!n.IsLeaf()) { EXPECT_GT(n.best_metric_val, params.min_impurity_decrease); }
+      }
+    }
+  }
+
   void TestDeterminism()
   {
     // Regression models use floating point atomics, so are not bitwise reproducible
     bool is_regression = params.split_criterion == MSE || params.split_criterion == MAE;
   void Test()
   {
     TestAccuracyImprovement();
-    // Bugs
-    // TestDeterminism();
-    // TestTreeSize();
+    TestDeterminism();
+    TestMinImpurity();
+    TestTreeSize();
   }

   RF_metrics training_metrics;
 TEST_P(RfTest, PropertyBasedTest)
{} // Parameter ranges to test -std::vector n_rows = {10, 100, 1452}; -std::vector n_cols = {1, 5, 152, 1014}; -std::vector n_trees = {1, 5, 17}; -std::vector max_features = {0.1f, 0.5f, 1.0f}; -std::vector max_samples = {0.1f, 0.5f, 1.0f}; -std::vector max_depth = {1, 10, 30}; -std::vector max_leaves = {-1}; // Bug for max_leaves, non-determinism as threads compete to - // place their nodes inside this limit -std::vector bootstrap = {false, true}; -std::vector n_bins = {2, 57, 128}; // Bug for n_bins > 128. Uses too much shared memory. +std::vector n_rows = {10, 100, 1452}; +std::vector n_cols = {1, 5, 152, 1014}; +std::vector n_trees = {1, 5, 17}; +std::vector max_features = {0.1f, 0.5f, 1.0f}; +std::vector max_samples = {0.1f, 0.5f, 1.0f}; +std::vector max_depth = {1, 10, 30}; +std::vector max_leaves = {-1, 16, 50}; +std::vector bootstrap = {false, true}; +std::vector n_bins = {2, 57, 128, 256}; std::vector min_samples_leaf = {1, 10, 30}; std::vector min_samples_split = {2, 10}; -std::vector min_impurity_decrease = {0.0, 1.0f, 10.0f}; +std::vector min_impurity_decrease = {0.0f, 1.0f, 10.0f}; std::vector n_streams = {1, 2, 10}; std::vector split_criterion = { CRITERION::POISSON, CRITERION::MSE, CRITERION::GINI, CRITERION::ENTROPY}; From 3142caf596d425812b61992b739af13dafd11b3e Mon Sep 17 00:00:00 2001 From: venkywonka Date: Tue, 31 Aug 2021 22:30:32 +0530 Subject: [PATCH 12/42] FIX clang format --- .../batched-levelalgo/metrics.cuh | 134 +++++++-------- cpp/test/sg/rf_test.cu | 155 ++++++++---------- 2 files changed, 138 insertions(+), 151 deletions(-) diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index 19009b4234..c1c7a47bed 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -102,28 +102,29 @@ class GiniObjectiveFunction { DI IdxT NumClasses() const { return nclasses; } - HDI DataT gain(BinT* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) { - - auto nRight = len - nLeft; + HDI DataT gain(BinT* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) + { + auto nRight = len - nLeft; constexpr DataT One = DataT(1.0); - auto invlen = One / len; - auto invLeft = One / nLeft; - auto invRight = One / nRight; - auto gain_ = DataT(0.0); + auto invlen = One / len; + auto invLeft = One / nLeft; + auto invRight = One / nRight; + auto gain_ = DataT(0.0); // if there aren't enough samples in this split, don't bother! - if (nLeft < min_samples_leaf || nRight < min_samples_leaf) return -std::numeric_limits::max(); + if (nLeft < min_samples_leaf || nRight < min_samples_leaf) + return -std::numeric_limits::max(); for (IdxT j = 0; j < nclasses; ++j) { - int val_i = 0; + int val_i = 0; auto lval_i = hist[nbins * j + i].x; - auto lval = DataT(lval_i); + auto lval = DataT(lval_i); gain_ += lval * invLeft * lval * invlen; val_i += lval_i; auto total_sum = hist[nbins * j + nbins - 1].x; - auto rval_i = total_sum - lval_i; - auto rval = DataT(rval_i); + auto rval_i = total_sum - lval_i; + auto rval = DataT(rval_i); gain_ += rval * invRight * rval * invlen; val_i += rval_i; @@ -132,12 +133,14 @@ class GiniObjectiveFunction { } // if the gain is not "enough", don't bother! 
- if (gain_ <= min_impurity_decrease) return -std::numeric_limits::max(); + if (gain_ <= min_impurity_decrease) + return -std::numeric_limits::max(); - else return gain_; + else + return gain_; } - DI Split Gain(BinT * shist, DataT * sbins, IdxT col, IdxT len, IdxT nbins) + DI Split Gain(BinT* shist, DataT* sbins, IdxT col, IdxT len, IdxT nbins) { Split sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { @@ -150,7 +153,7 @@ class GiniObjectiveFunction { return sp; } - static DI LabelT LeafPrediction(BinT const * shist, int nclasses) + static DI LabelT LeafPrediction(BinT const* shist, int nclasses) { int class_idx = 0; int count = 0; @@ -185,20 +188,17 @@ class EntropyObjectiveFunction { } DI IdxT NumClasses() const { return nclasses; } - HDI DataT gain(BinT const * hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) + HDI DataT gain(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) { - auto nRight {len - nLeft}; - auto gain_ {DataT(0.0)}; + auto nRight{len - nLeft}; + auto gain_{DataT(0.0)}; // if there aren't enough samples in this split, don't bother! - if (nLeft < min_samples_leaf || nRight < min_samples_leaf) - { + if (nLeft < min_samples_leaf || nRight < min_samples_leaf) { return -std::numeric_limits::max(); - } - else - { - auto invLeft {DataT(1.0) / nLeft}; - auto invRight {DataT(1.0) / nRight}; - auto invLen {DataT(1.0) / len}; + } else { + auto invLeft{DataT(1.0) / nLeft}; + auto invRight{DataT(1.0) / nRight}; + auto invLen{DataT(1.0) / len}; for (IdxT c = 0; c < nclasses; ++c) { int val_i = 0; auto lval_i = hist[nbins * c + i].x; @@ -222,10 +222,10 @@ class EntropyObjectiveFunction { } } - // if the gain is not "enough", don't bother! - if (gain_ <= min_impurity_decrease) return -std::numeric_limits::max(); + // if the gain is not "enough", don't bother! + if (gain_ <= min_impurity_decrease) return -std::numeric_limits::max(); - return gain_; + return gain_; } } @@ -233,17 +233,16 @@ class EntropyObjectiveFunction { { Split sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { - auto nLeft {IdxT(0)}; - for (IdxT j = 0; j < nclasses; ++j) - { + auto nLeft{IdxT(0)}; + for (IdxT j = 0; j < nclasses; ++j) { nLeft += scdf_labels[nbins * j + i].x; } - sp.update({sbins[i], col, gain(scdf_labels, i , nbins, len, nLeft), nLeft}); + sp.update({sbins[i], col, gain(scdf_labels, i, nbins, len, nLeft), nLeft}); } return sp; } - static DI LabelT LeafPrediction(BinT const * shist, int nclasses) + static DI LabelT LeafPrediction(BinT const* shist, int nclasses) { // Same as Gini return GiniObjectiveFunction::LeafPrediction(shist, nclasses); @@ -280,36 +279,40 @@ class PoissonObjectiveFunction { * of the impurity decrease for a given split. * The Gain is the difference in the proxy impurities of the parent and the * weighted sum of impurities of its children. - */ - HDI DataT gain(BinT const * hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) { - + */ + HDI DataT gain(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) + { // get the lens' - auto nRight = len - nLeft; + auto nRight = len - nLeft; // if there aren't enough samples in this split, don't bother! 
- if (nLeft < min_samples_leaf || nRight < min_samples_leaf) return -std::numeric_limits::max(); + if (nLeft < min_samples_leaf || nRight < min_samples_leaf) + return -std::numeric_limits::max(); - auto label_sum = hist[nbins - 1].label_sum; - auto left_label_sum = (hist[i].label_sum); - auto right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum); + auto label_sum = hist[nbins - 1].label_sum; + auto left_label_sum = (hist[i].label_sum); + auto right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum); // label sum cannot be non-positive - if (label_sum < EPS || left_label_sum < EPS || right_label_sum < EPS) return -std::numeric_limits::max(); + if (label_sum < EPS || left_label_sum < EPS || right_label_sum < EPS) + return -std::numeric_limits::max(); // compute the gain to be - DataT parent_obj = -label_sum * raft::myLog(label_sum / len); - DataT left_obj = -left_label_sum * raft::myLog(left_label_sum / nLeft); - DataT right_obj = -right_label_sum * raft::myLog(right_label_sum / nRight); - auto gain_ = parent_obj - (left_obj + right_obj); - gain_ = gain_ / len; + DataT parent_obj = -label_sum * raft::myLog(label_sum / len); + DataT left_obj = -left_label_sum * raft::myLog(left_label_sum / nLeft); + DataT right_obj = -right_label_sum * raft::myLog(right_label_sum / nRight); + auto gain_ = parent_obj - (left_obj + right_obj); + gain_ = gain_ / len; // if the gain is not "enough", don't bother! - if (gain_ <= min_impurity_decrease) return -std::numeric_limits::max(); + if (gain_ <= min_impurity_decrease) + return -std::numeric_limits::max(); - else return gain_; + else + return gain_; } - DI Split Gain(BinT const * shist, DataT const * sbins, IdxT col, IdxT len, IdxT nbins) + DI Split Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) { Split sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { @@ -319,7 +322,7 @@ class PoissonObjectiveFunction { return sp; } - static DI LabelT LeafPrediction(BinT const * shist, int nclasses) + static DI LabelT LeafPrediction(BinT const* shist, int nclasses) { return shist[0].label_sum / shist[0].count; } @@ -343,24 +346,21 @@ class MSEObjectiveFunction { } DI IdxT NumClasses() const { return 1; } - HDI DataT gain(BinT const * hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) + HDI DataT gain(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) { - auto gain_ {DataT(0)}; - auto nRight {len - nLeft}; - auto invLen {DataT(1.0) / len}; + auto gain_{DataT(0)}; + auto nRight{len - nLeft}; + auto invLen{DataT(1.0) / len}; // if there aren't enough samples in this split, don't bother! - if (nLeft < min_samples_leaf || nRight < min_samples_leaf) - { + if (nLeft < min_samples_leaf || nRight < min_samples_leaf) { return -std::numeric_limits::max(); - } - else - { - auto label_sum = hist[nbins - 1].label_sum; + } else { + auto label_sum = hist[nbins - 1].label_sum; auto parent_obj = -label_sum * label_sum * invLen; auto left_obj = -(hist[i].label_sum * hist[i].label_sum) / nLeft; auto right_label_sum = hist[i].label_sum - label_sum; auto right_obj = -(right_label_sum * right_label_sum) / nRight; - gain_ = parent_obj - (left_obj + right_obj); + gain_ = parent_obj - (left_obj + right_obj); gain_ *= invLen; // if the gain is not "enough", don't bother! 
@@ -370,17 +370,17 @@ class MSEObjectiveFunction { } } - DI Split Gain(BinT const * shist, DataT const * sbins, IdxT col, IdxT len, IdxT nbins) + DI Split Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) { Split sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { - auto nLeft = shist[i].count; + auto nLeft = shist[i].count; sp.update({sbins[i], col, gain(shist, i, nbins, len, nLeft), nLeft}); } return sp; } - static DI LabelT LeafPrediction(BinT const * shist, int nclasses) + static DI LabelT LeafPrediction(BinT const* shist, int nclasses) { return shist[0].label_sum / shist[0].count; } diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index 6641c585ff..c6dbe45fbb 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include +#include #include @@ -532,20 +532,17 @@ INSTANTIATE_TEST_CASE_P(RfTests, RFQuantileBinsLowerBoundTestD, ::testing::Value namespace DT { -struct ObjectiveTestParameters -{ +struct ObjectiveTestParameters { CRITERION criterion; uint64_t seed; int n_bins; int n_classes; double min_impurity_decrease; int min_samples_leaf; - }; template -class ObjectiveTest : public ::testing::TestWithParam -{ +class ObjectiveTest : public ::testing::TestWithParam { typedef typename ObjectiveT::DataT DataT; typedef typename ObjectiveT::LabelT LabelT; typedef typename ObjectiveT::IdxT IdxT; @@ -554,69 +551,62 @@ class ObjectiveTest : public ::testing::TestWithParam ObjectiveTestParameters params; public: + auto _rand(int const end = 1000) { return rand() % end; } - auto _rand(int const end = 1000) - { - return rand() % end; - } - - auto _gen_hist_bins(){ + auto _gen_hist_bins() + { std::vector hist_bins(params.n_bins * params.n_classes); - for(auto c = 0; c < params.n_classes; ++c) - { - for(auto b = 0; b < params.n_bins; ++b) - { + for (auto c = 0; c < params.n_classes; ++c) { + for (auto b = 0; b < params.n_bins; ++b) { // initializing hist_bins BinT tmp = BinT(); - if constexpr(std::is_same::value) // classification type + if constexpr (std::is_same::value) // classification type { tmp += BinT(_rand()); - hist_bins[c*params.n_bins + b] += tmp; // random pdf bin - hist_bins[c*params.n_bins + b] += ( b > 0 ? hist_bins[c*params.n_bins + b - 1] : BinT()); // pdf to cdf - } - else // regression type + hist_bins[c * params.n_bins + b] += tmp; // random pdf bin + hist_bins[c * params.n_bins + b] += + (b > 0 ? hist_bins[c * params.n_bins + b - 1] : BinT()); // pdf to cdf + } else // regression type { tmp += BinT(static_cast(_rand()), _rand()); - hist_bins[c*params.n_bins + b] += tmp; // random pdf bin - hist_bins[c*params.n_bins + b] += ( b > 0 ? hist_bins[c*params.n_bins + b - 1] : BinT()); // pdf to cdf + hist_bins[c * params.n_bins + b] += tmp; // random pdf bin + hist_bins[c * params.n_bins + b] += + (b > 0 ? 
hist_bins[c * params.n_bins + b - 1] : BinT()); // pdf to cdf } } } return hist_bins; } - auto _poisson_ground_truth_gain(std::vector const & hist_bins, std::size_t split_bin_index) + auto _poisson_ground_truth_gain(std::vector const& hist_bins, std::size_t split_bin_index) { - // compute the gain to be - DataT label_sum = hist_bins.back().label_sum; - IdxT len = hist_bins.back().count; - IdxT nLeft = hist_bins[split_bin_index].count; - DataT left_label_sum = hist_bins[split_bin_index].label_sum; - DataT right_label_sum= label_sum - left_label_sum; - IdxT nRight = len - nLeft; - DataT parent_obj = -label_sum * raft::myLog(label_sum / len); - DataT left_obj = -left_label_sum * raft::myLog(left_label_sum / nLeft); - DataT right_obj = -right_label_sum * raft::myLog(right_label_sum / nRight); - auto gain = parent_obj - (left_obj + right_obj); - gain = gain / len; + DataT label_sum = hist_bins.back().label_sum; + IdxT len = hist_bins.back().count; + IdxT nLeft = hist_bins[split_bin_index].count; + DataT left_label_sum = hist_bins[split_bin_index].label_sum; + DataT right_label_sum = label_sum - left_label_sum; + IdxT nRight = len - nLeft; + DataT parent_obj = -label_sum * raft::myLog(label_sum / len); + DataT left_obj = -left_label_sum * raft::myLog(left_label_sum / nLeft); + DataT right_obj = -right_label_sum * raft::myLog(right_label_sum / nRight); + auto gain = parent_obj - (left_obj + right_obj); + gain = gain / len; // edge cases - if (gain <= params.min_impurity_decrease || - nLeft < params.min_samples_leaf || - nRight < params.min_samples_leaf || - label_sum < EPS || - right_label_sum < EPS || + if (gain <= params.min_impurity_decrease || nLeft < params.min_samples_leaf || + nRight < params.min_samples_leaf || label_sum < EPS || right_label_sum < EPS || left_label_sum < EPS) return -std::numeric_limits::max(); - else return gain; - + else + return gain; } - auto _gini_ground_truth_gain(std::vector const & hist_bins, std::size_t const split_bin_index) + auto _gini_ground_truth_gain(std::vector const& hist_bins, + std::size_t const split_bin_index) { - auto len = _get_nLeft(hist_bins, params.n_bins-1); - auto nLeft = _get_nLeft(hist_bins, split_bin_index); + auto len = _get_nLeft(hist_bins, params.n_bins - 1); + auto nLeft = _get_nLeft(hist_bins, split_bin_index); auto nRight = len - nLeft; constexpr DataT One = DataT(1.0); auto invlen = One / len; @@ -624,8 +614,7 @@ class ObjectiveTest : public ::testing::TestWithParam auto invRight = One / nRight; auto gain = DataT(0.0); - for(IdxT c = 0; c < params.n_classes; ++c) - { + for (IdxT c = 0; c < params.n_classes; ++c) { IdxT val_i = 0; auto lval_i = hist_bins[params.n_bins * c + split_bin_index].x; auto lval = DataT(lval_i); @@ -643,41 +632,36 @@ class ObjectiveTest : public ::testing::TestWithParam } // edge cases - if (gain <= params.min_impurity_decrease || - nLeft < params.min_samples_leaf || - nRight < params.min_samples_leaf) - { + if (gain <= params.min_impurity_decrease || nLeft < params.min_samples_leaf || + nRight < params.min_samples_leaf) { return -std::numeric_limits::max(); - } - else - { + } else { return gain; } } - auto _get_ground_truth_gain(std::vector const & hist_bins, std::size_t const split_bin_index) + auto _get_ground_truth_gain(std::vector const& hist_bins, std::size_t const split_bin_index) { - if constexpr(std::is_same>::value) // poisson + if constexpr (std::is_same>::value) // poisson { return _poisson_ground_truth_gain(hist_bins, split_bin_index); - } - else if constexpr(std::is_same>::value) // gini + } else 
if constexpr (std::is_same>::value) // gini { return _gini_ground_truth_gain(hist_bins, split_bin_index); } return (double)0.0; } - auto _get_nLeft(std::vector const & hist_bins, IdxT idx) + auto _get_nLeft(std::vector const& hist_bins, IdxT idx) { - auto count {IdxT(0)}; - for (auto c = 0; c < params.n_classes; ++c) - { - if constexpr(std::is_same::value) // countbin + auto count{IdxT(0)}; + for (auto c = 0; c < params.n_classes; ++c) { + if constexpr (std::is_same::value) // countbin { count += hist_bins[params.n_bins * c + idx].x; - } - else // aggregatebin + } else // aggregatebin { count += hist_bins[params.n_bins * c + idx].count; } @@ -691,42 +675,45 @@ class ObjectiveTest : public ::testing::TestWithParam params = ::testing::TestWithParam::GetParam(); ObjectiveT objective(params.n_classes, params.min_impurity_decrease, params.min_samples_leaf); - auto hist_bins = _gen_hist_bins(); - auto split_bin_index = _rand(params.n_bins); + auto hist_bins = _gen_hist_bins(); + auto split_bin_index = _rand(params.n_bins); auto ground_truth_gain = _get_ground_truth_gain(hist_bins, split_bin_index); - auto hypothesis_gain = objective.gain(&hist_bins[0], + auto hypothesis_gain = objective.gain(&hist_bins[0], split_bin_index, params.n_bins, - _get_nLeft(hist_bins, params.n_bins-1), + _get_nLeft(hist_bins, params.n_bins - 1), _get_nLeft(hist_bins, split_bin_index)); ASSERT_EQ(ground_truth_gain, hypothesis_gain); - } }; const std::vector poisson_objective_test_parameters = { - {CRITERION::POISSON, 9507819643927052255LLU, 64, 1, 0.0001, 0}, - {CRITERION::POISSON, 9507819643927052256LLU, 128, 1, 0.0001, 1}, - {CRITERION::POISSON, 9507819643927052257LLU, 256, 1, 0.0001, 1}, - {CRITERION::POISSON, 9507819643927052258LLU, 512, 1, 0.0001, 5}, - }; + {CRITERION::POISSON, 9507819643927052255LLU, 64, 1, 0.0001, 0}, + {CRITERION::POISSON, 9507819643927052256LLU, 128, 1, 0.0001, 1}, + {CRITERION::POISSON, 9507819643927052257LLU, 256, 1, 0.0001, 1}, + {CRITERION::POISSON, 9507819643927052258LLU, 512, 1, 0.0001, 5}, +}; const std::vector gini_objective_test_parameters = { - {CRITERION::GINI, 9507819643927052255LLU, 64, 2, 0.0001, 0}, - {CRITERION::GINI, 9507819643927052256LLU, 128, 10, 0.0001, 1}, - {CRITERION::GINI, 9507819643927052257LLU, 256, 100, 0.0001, 1}, - {CRITERION::GINI, 9507819643927052258LLU, 512, 100, 0.0001, 5}, - }; + {CRITERION::GINI, 9507819643927052255LLU, 64, 2, 0.0001, 0}, + {CRITERION::GINI, 9507819643927052256LLU, 128, 10, 0.0001, 1}, + {CRITERION::GINI, 9507819643927052257LLU, 256, 100, 0.0001, 1}, + {CRITERION::GINI, 9507819643927052258LLU, 512, 100, 0.0001, 5}, +}; // poisson objective test typedef ObjectiveTest> PoissonObjectiveTestD; TEST_P(PoissonObjectiveTestD, poissonObjectiveTest) {} -INSTANTIATE_TEST_CASE_P(RfTests, PoissonObjectiveTestD, ::testing::ValuesIn(poisson_objective_test_parameters)); +INSTANTIATE_TEST_CASE_P(RfTests, + PoissonObjectiveTestD, + ::testing::ValuesIn(poisson_objective_test_parameters)); // gini objective test typedef ObjectiveTest> GiniObjectiveTestD; TEST_P(GiniObjectiveTestD, giniObjectiveTest) {} -INSTANTIATE_TEST_CASE_P(RfTests, GiniObjectiveTestD, ::testing::ValuesIn(gini_objective_test_parameters)); +INSTANTIATE_TEST_CASE_P(RfTests, + GiniObjectiveTestD, + ::testing::ValuesIn(gini_objective_test_parameters)); -} // end namespace DT +} // end namespace DT } // end namespace ML From 9676818db80dad48a07757789ee21e35672b765f Mon Sep 17 00:00:00 2001 From: venkywonka Date: Tue, 31 Aug 2021 22:38:50 +0530 Subject: [PATCH 13/42] remove debug code --- 
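Note: the mail rendering above drops template argument lists (e.g. in
`ObjectiveTest : public ::testing::TestWithParam` and in the `std::is_same`
checks). With the instantiations assumed from the typedefs at the end of the
test file, the objective dispatch in `_get_ground_truth_gain` plausibly reads:

    if constexpr (std::is_same<ObjectiveT,
                               PoissonObjectiveFunction<DataT, LabelT, IdxT>>::value) {
      return _poisson_ground_truth_gain(hist_bins, split_bin_index);
    } else if constexpr (std::is_same<ObjectiveT,
                                      GiniObjectiveFunction<DataT, LabelT, IdxT>>::value) {
      return _gini_ground_truth_gain(hist_bins, split_bin_index);
    }
    return (double)0.0;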
cpp/test/sg/rf_test.cu | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index c6dbe45fbb..0757ee2bd8 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include #include @@ -336,7 +335,7 @@ class RfSpecialisedTest { { TestAccuracyImprovement(); TestDeterminism(); - TestTreeSize(); + TestMinImpurity(); TestTreeSize(); } @@ -533,7 +532,6 @@ INSTANTIATE_TEST_CASE_P(RfTests, RFQuantileBinsLowerBoundTestD, ::testing::Value namespace DT { struct ObjectiveTestParameters { - CRITERION criterion; uint64_t seed; int n_bins; int n_classes; @@ -689,16 +687,16 @@ class ObjectiveTest : public ::testing::TestWithParam { }; const std::vector poisson_objective_test_parameters = { - {CRITERION::POISSON, 9507819643927052255LLU, 64, 1, 0.0001, 0}, - {CRITERION::POISSON, 9507819643927052256LLU, 128, 1, 0.0001, 1}, - {CRITERION::POISSON, 9507819643927052257LLU, 256, 1, 0.0001, 1}, - {CRITERION::POISSON, 9507819643927052258LLU, 512, 1, 0.0001, 5}, + {9507819643927052255LLU, 64, 1, 0.0001, 0}, + {9507819643927052256LLU, 128, 1, 0.0001, 1}, + {9507819643927052257LLU, 256, 1, 0.0001, 1}, + {9507819643927052258LLU, 512, 1, 0.0001, 5}, }; const std::vector gini_objective_test_parameters = { - {CRITERION::GINI, 9507819643927052255LLU, 64, 2, 0.0001, 0}, - {CRITERION::GINI, 9507819643927052256LLU, 128, 10, 0.0001, 1}, - {CRITERION::GINI, 9507819643927052257LLU, 256, 100, 0.0001, 1}, - {CRITERION::GINI, 9507819643927052258LLU, 512, 100, 0.0001, 5}, + {9507819643927052255LLU, 64, 2, 0.0001, 0}, + {9507819643927052256LLU, 128, 10, 0.0001, 1}, + {9507819643927052257LLU, 256, 100, 0.0001, 1}, + {9507819643927052258LLU, 512, 100, 0.0001, 5}, }; // poisson objective test From c52c29fd9bfdd64316eba3d38a5fb865b1a62f1c Mon Sep 17 00:00:00 2001 From: venkywonka Date: Thu, 2 Sep 2021 22:47:55 +0530 Subject: [PATCH 14/42] address review comments --- .../batched-levelalgo/metrics.cuh | 71 +++---- cpp/test/sg/rf_test.cu | 197 ++++++++++-------- 2 files changed, 145 insertions(+), 123 deletions(-) diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index c1c7a47bed..952101a6d7 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -27,12 +27,11 @@ namespace ML { namespace DT { -#define EPS 10 * std::numeric_limits::epsilon() - struct CountBin { int x; - HDI CountBin() : x(0) {} + CountBin(CountBin const&) = default; HDI CountBin(int x_) : x(x_) {} + HDI CountBin() : x(0){}; DI static void IncrementHistogram(CountBin* hist, int nbins, int b, int label) { @@ -56,7 +55,8 @@ struct AggregateBin { double label_sum; int count; - HDI AggregateBin() : label_sum(0.0), count(0) {} + AggregateBin(AggregateBin const&) = default; + HDI AggregateBin() : label_sum(0.0), count(0){}; HDI AggregateBin(double label_sum, int count) : label_sum(label_sum), count(count) {} DI static void IncrementHistogram(AggregateBin* hist, int nbins, int b, double label) @@ -102,14 +102,14 @@ class GiniObjectiveFunction { DI IdxT NumClasses() const { return nclasses; } - HDI DataT gain(BinT* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) + HDI DataT GainPerSplit(BinT* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) { auto nRight = len - nLeft; constexpr DataT One = DataT(1.0); auto invlen = One / len; auto invLeft = 
One / nLeft; auto invRight = One / nRight; - auto gain_ = DataT(0.0); + auto gain = DataT(0.0); // if there aren't enough samples in this split, don't bother! if (nLeft < min_samples_leaf || nRight < min_samples_leaf) @@ -119,25 +119,25 @@ class GiniObjectiveFunction { int val_i = 0; auto lval_i = hist[nbins * j + i].x; auto lval = DataT(lval_i); - gain_ += lval * invLeft * lval * invlen; + gain += lval * invLeft * lval * invlen; val_i += lval_i; auto total_sum = hist[nbins * j + nbins - 1].x; auto rval_i = total_sum - lval_i; auto rval = DataT(rval_i); - gain_ += rval * invRight * rval * invlen; + gain += rval * invRight * rval * invlen; val_i += rval_i; auto val = DataT(val_i) * invlen; - gain_ -= val * val; + gain -= val * val; } // if the gain is not "enough", don't bother! - if (gain_ <= min_impurity_decrease) + if (gain <= min_impurity_decrease) return -std::numeric_limits::max(); else - return gain_; + return gain; } DI Split Gain(BinT* shist, DataT* sbins, IdxT col, IdxT len, IdxT nbins) @@ -148,7 +148,7 @@ class GiniObjectiveFunction { for (IdxT j = 0; j < nclasses; ++j) { nLeft += shist[nbins * j + i].x; } - sp.update({sbins[i], col, gain(shist, i, nbins, len, nLeft), nLeft}); + sp.update({sbins[i], col, GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); } return sp; } @@ -188,10 +188,10 @@ class EntropyObjectiveFunction { } DI IdxT NumClasses() const { return nclasses; } - HDI DataT gain(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) + HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) { auto nRight{len - nLeft}; - auto gain_{DataT(0.0)}; + auto gain{DataT(0.0)}; // if there aren't enough samples in this split, don't bother! if (nLeft < min_samples_leaf || nRight < min_samples_leaf) { return -std::numeric_limits::max(); @@ -204,7 +204,7 @@ class EntropyObjectiveFunction { auto lval_i = hist[nbins * c + i].x; if (lval_i != 0) { auto lval = DataT(lval_i); - gain_ += raft::myLog(lval * invLeft) / raft::myLog(DataT(2)) * lval * invLen; + gain += raft::myLog(lval * invLeft) / raft::myLog(DataT(2)) * lval * invLen; } val_i += lval_i; @@ -212,20 +212,20 @@ class EntropyObjectiveFunction { auto rval_i = total_sum - lval_i; if (rval_i != 0) { auto rval = DataT(rval_i); - gain_ += raft::myLog(rval * invRight) / raft::myLog(DataT(2)) * rval * invLen; + gain += raft::myLog(rval * invRight) / raft::myLog(DataT(2)) * rval * invLen; } val_i += rval_i; if (val_i != 0) { auto val = DataT(val_i) * invLen; - gain_ -= val * raft::myLog(val) / raft::myLog(DataT(2)); + gain -= val * raft::myLog(val) / raft::myLog(DataT(2)); } } // if the gain is not "enough", don't bother! 
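// The expression accumulated above is the information gain in bits,
//   IG = H(parent) - (n_L/n) * H(left) - (n_R/n) * H(right),
// expanded per class so that left, right and parent terms are summed
// directly. A host-side reference for H under that convention (standalone
// sketch, not part of this patch):
#include <cmath>
#include <numeric>
#include <vector>
double EntropyBits(std::vector<int> const& counts)
{
  int n    = std::accumulate(counts.begin(), counts.end(), 0);
  double h = 0.0;
  for (int c : counts) {
    if (c == 0) continue;  // skip empty classes, as the kernel does
    double p = double(c) / n;
    h -= p * std::log2(p);
  }
  return h;
}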
- if (gain_ <= min_impurity_decrease) return -std::numeric_limits::max(); + if (gain <= min_impurity_decrease) return -std::numeric_limits::max(); - return gain_; + return gain; } } @@ -237,7 +237,7 @@ class EntropyObjectiveFunction { for (IdxT j = 0; j < nclasses; ++j) { nLeft += scdf_labels[nbins * j + i].x; } - sp.update({sbins[i], col, gain(scdf_labels, i, nbins, len, nLeft), nLeft}); + sp.update({sbins[i], col, GainPerSplit(scdf_labels, i, nbins, len, nLeft), nLeft}); } return sp; } @@ -261,7 +261,8 @@ class PoissonObjectiveFunction { IdxT min_samples_leaf; public: - using BinT = AggregateBin; + using BinT = AggregateBin; + static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); HDI PoissonObjectiveFunction(IdxT nclasses, DataT min_impurity_decrease, IdxT min_samples_leaf) : min_impurity_decrease(min_impurity_decrease), min_samples_leaf(min_samples_leaf) @@ -280,7 +281,7 @@ class PoissonObjectiveFunction { * The Gain is the difference in the proxy impurities of the parent and the * weighted sum of impurities of its children. */ - HDI DataT gain(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) + HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) { // get the lens' auto nRight = len - nLeft; @@ -294,22 +295,22 @@ class PoissonObjectiveFunction { auto right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum); // label sum cannot be non-positive - if (label_sum < EPS || left_label_sum < EPS || right_label_sum < EPS) + if (label_sum < eps_ || left_label_sum < eps_ || right_label_sum < eps_) return -std::numeric_limits::max(); // compute the gain to be DataT parent_obj = -label_sum * raft::myLog(label_sum / len); DataT left_obj = -left_label_sum * raft::myLog(left_label_sum / nLeft); DataT right_obj = -right_label_sum * raft::myLog(right_label_sum / nRight); - auto gain_ = parent_obj - (left_obj + right_obj); - gain_ = gain_ / len; + auto gain = parent_obj - (left_obj + right_obj); + gain = gain / len; // if the gain is not "enough", don't bother! - if (gain_ <= min_impurity_decrease) + if (gain <= min_impurity_decrease) return -std::numeric_limits::max(); else - return gain_; + return gain; } DI Split Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) @@ -317,7 +318,7 @@ class PoissonObjectiveFunction { Split sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { auto nLeft = shist[i].count; - sp.update({sbins[i], col, gain(shist, i, nbins, len, nLeft), nLeft}); + sp.update({sbins[i], col, GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); } return sp; } @@ -346,9 +347,9 @@ class MSEObjectiveFunction { } DI IdxT NumClasses() const { return 1; } - HDI DataT gain(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) + HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) { - auto gain_{DataT(0)}; + auto gain{DataT(0)}; auto nRight{len - nLeft}; auto invLen{DataT(1.0) / len}; // if there aren't enough samples in this split, don't bother! @@ -360,13 +361,13 @@ class MSEObjectiveFunction { auto left_obj = -(hist[i].label_sum * hist[i].label_sum) / nLeft; auto right_label_sum = hist[i].label_sum - label_sum; auto right_obj = -(right_label_sum * right_label_sum) / nRight; - gain_ = parent_obj - (left_obj + right_obj); - gain_ *= invLen; + gain = parent_obj - (left_obj + right_obj); + gain *= invLen; // if the gain is not "enough", don't bother! 
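// In the Gain() wrappers below, each thread strides over candidate bins
// (i = threadIdx.x, threadIdx.x + blockDim.x, ...) and keeps its running best
// in a thread-local Split. The Split type itself is defined elsewhere; a
// sketch of the update semantics assumed by these loops (fields hypothetical):
//
//   struct Split {
//     DataT quesval;          // split threshold
//     IdxT colid;             // feature column
//     DataT best_metric_val;  // gain of the best candidate so far
//     IdxT nLeft;             // rows routed to the left child
//     DI void update(Split const& other)
//     {
//       if (other.best_metric_val > best_metric_val) { *this = other; }
//     }
//   };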
- if (gain_ <= min_impurity_decrease) return -std::numeric_limits::max(); + if (gain <= min_impurity_decrease) return -std::numeric_limits::max(); - return gain_; + return gain; } } @@ -375,7 +376,7 @@ class MSEObjectiveFunction { Split sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { auto nLeft = shist[i].count; - sp.update({sbins[i], col, gain(shist, i, nbins, len, nLeft), nLeft}); + sp.update({sbins[i], col, GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); } return sp; } diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index 0757ee2bd8..d38aacf06b 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -537,6 +537,7 @@ struct ObjectiveTestParameters { int n_classes; double min_impurity_decrease; int min_samples_leaf; + double tolerance; }; template @@ -549,119 +550,137 @@ class ObjectiveTest : public ::testing::TestWithParam { ObjectiveTestParameters params; public: - auto _rand(int const end = 1000) { return rand() % end; } + auto RandUnder(int const end = 10000) { return rand() % end; } - auto _gen_hist_bins() + auto GenHist() { - std::vector hist_bins(params.n_bins * params.n_classes); + std::vector cdf_hist, pdf_hist; + for (auto c = 0; c < params.n_classes; ++c) { for (auto b = 0; b < params.n_bins; ++b) { - // initializing hist_bins - BinT tmp = BinT(); - if constexpr (std::is_same::value) // classification type - { - tmp += BinT(_rand()); - hist_bins[c * params.n_bins + b] += tmp; // random pdf bin - hist_bins[c * params.n_bins + b] += - (b > 0 ? hist_bins[c * params.n_bins + b - 1] : BinT()); // pdf to cdf - } else // regression type - { - tmp += BinT(static_cast(_rand()), _rand()); - hist_bins[c * params.n_bins + b] += tmp; // random pdf bin - hist_bins[c * params.n_bins + b] += - (b > 0 ? hist_bins[c * params.n_bins + b - 1] : BinT()); // pdf to cdf - } + if constexpr (std::is_same::value) + pdf_hist.emplace_back(RandUnder()); + else + pdf_hist.emplace_back(static_cast(RandUnder()), RandUnder()); + + auto cumulative = b > 0 ? cdf_hist.back() : BinT(); + + cdf_hist.emplace_back(pdf_hist.empty() ? BinT() : pdf_hist.back()); + + cdf_hist.back() += cumulative; } } - return hist_bins; + + return std::make_pair(cdf_hist, pdf_hist); } - auto _poisson_ground_truth_gain(std::vector const& hist_bins, std::size_t split_bin_index) + auto PoissonHalfDeviance( + std::vector const& hist) // 1/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) { - // compute the gain to be - DataT label_sum = hist_bins.back().label_sum; - IdxT len = hist_bins.back().count; - IdxT nLeft = hist_bins[split_bin_index].count; - DataT left_label_sum = hist_bins[split_bin_index].label_sum; - DataT right_label_sum = label_sum - left_label_sum; - IdxT nRight = len - nLeft; - DataT parent_obj = -label_sum * raft::myLog(label_sum / len); - DataT left_obj = -left_label_sum * raft::myLog(left_label_sum / nLeft); - DataT right_obj = -right_label_sum * raft::myLog(right_label_sum / nRight); - auto gain = parent_obj - (left_obj + right_obj); - gain = gain / len; + BinT aggregate{BinT()}; + aggregate = std::accumulate(hist.begin(), hist.end(), aggregate); + assert(aggregate.count > 0); + auto const y_mean = aggregate.label_sum / aggregate.count; + auto poisson_half_deviance{DataT(0.0)}; + + std::for_each(hist.begin(), hist.end(), [&](BinT const& h) { + auto log_y = raft::myLog(h.label_sum ? 
h.label_sum : DataT(1.0)); // we don't want nans + poisson_half_deviance += h.label_sum * (log_y - raft::myLog(y_mean)) + y_mean - h.label_sum; + }); + + poisson_half_deviance /= aggregate.count; + return std::make_tuple( + poisson_half_deviance, aggregate.label_sum, static_cast(aggregate.count)); + } + + auto PoissonGroundTruthGain(std::vector const& pdf_hist, std::size_t split_bin_index) + { + std::vector left_pdf_hist{pdf_hist.begin(), pdf_hist.begin() + split_bin_index + 1}; + std::vector right_pdf_hist{pdf_hist.begin() + split_bin_index + 1, pdf_hist.end()}; + + auto [parent_phd, label_sum, n] = PoissonHalfDeviance(pdf_hist); + auto [left_phd, label_sum_left, n_left] = PoissonHalfDeviance(left_pdf_hist); + auto [right_phd, label_sum_right, n_right] = PoissonHalfDeviance(right_pdf_hist); + + auto gain = parent_phd - ((n_left / n) * left_phd + + (n_right / n) * right_phd); // gain in long form without proxy // edge cases - if (gain <= params.min_impurity_decrease || nLeft < params.min_samples_leaf || - nRight < params.min_samples_leaf || label_sum < EPS || right_label_sum < EPS || - left_label_sum < EPS) + if (gain <= params.min_impurity_decrease or n_left < params.min_samples_leaf or + n_right < params.min_samples_leaf or label_sum < ObjectiveT::eps_ or + label_sum_right < ObjectiveT::eps_ or label_sum_left < ObjectiveT::eps_) return -std::numeric_limits::max(); else return gain; } - auto _gini_ground_truth_gain(std::vector const& hist_bins, - std::size_t const split_bin_index) + auto GiniImpurity(std::vector const& hist) + { // sum((n_c/n_total)(1-(n_c/n_total))) + auto gini{double(0)}; + auto n_bins = hist.size() / params.n_classes; + auto n_instances = std::accumulate(hist.begin(), hist.end(), BinT()).x; // total instances + for (auto c = 0; c < params.n_classes; ++c) { + auto begin_iter = hist.begin() + c * n_bins; + auto end_iter = hist.begin() + (c + 1) * n_bins; + double class_proba = std::accumulate(begin_iter, end_iter, BinT()).x; // instances of class c + class_proba /= n_instances; // probability of class c + gini += class_proba * (1 - class_proba); // adding gain + } + return std::make_pair(gini, double(n_instances)); + } + + auto GiniGroundTruthGain(std::vector const& pdf_hist, std::size_t const split_bin_index) { - auto len = _get_nLeft(hist_bins, params.n_bins - 1); - auto nLeft = _get_nLeft(hist_bins, split_bin_index); - auto nRight = len - nLeft; - constexpr DataT One = DataT(1.0); - auto invlen = One / len; - auto invLeft = One / nLeft; - auto invRight = One / nRight; - auto gain = DataT(0.0); - - for (IdxT c = 0; c < params.n_classes; ++c) { - IdxT val_i = 0; - auto lval_i = hist_bins[params.n_bins * c + split_bin_index].x; - auto lval = DataT(lval_i); - gain += lval * invLeft * lval * invlen; - - val_i += lval_i; - auto total_sum = hist_bins[params.n_bins * c + params.n_bins - 1].x; - auto rval_i = total_sum - lval_i; - auto rval = DataT(rval_i); - gain += rval * invRight * rval * invlen; - - val_i += rval_i; - auto val = DataT(val_i) * invlen; - gain -= val * val; + std::vector left_pdf_hist, right_pdf_hist; + + for (auto c = 0; c < params.n_classes; ++c) { // decompose the pdf_hist + auto start = pdf_hist.begin() + c * params.n_bins; + auto split = pdf_hist.begin() + c * params.n_bins + split_bin_index + 1; + auto end = pdf_hist.begin() + (c + 1) * params.n_bins; + + left_pdf_hist.insert(left_pdf_hist.end(), start, split); + right_pdf_hist.insert(right_pdf_hist.end(), split, end); } + auto [parent_gini, n] = GiniImpurity(pdf_hist); + auto [left_gini, left_n] = 
GiniImpurity(left_pdf_hist); + auto [right_gini, right_n] = GiniImpurity(right_pdf_hist); + + auto gain = parent_gini - ((left_n / n) * left_gini + (right_n / n) * right_gini); + // edge cases - if (gain <= params.min_impurity_decrease || nLeft < params.min_samples_leaf || - nRight < params.min_samples_leaf) { + if (gain <= params.min_impurity_decrease || left_n < params.min_samples_leaf || + right_n < params.min_samples_leaf) { return -std::numeric_limits::max(); } else { return gain; } } - auto _get_ground_truth_gain(std::vector const& hist_bins, std::size_t const split_bin_index) + auto GroundTruthGain(std::vector const& pdf_hist, std::size_t const split_bin_index) { if constexpr (std::is_same>::value) // poisson { - return _poisson_ground_truth_gain(hist_bins, split_bin_index); + return PoissonGroundTruthGain(pdf_hist, split_bin_index); } else if constexpr (std::is_same>::value) // gini { - return _gini_ground_truth_gain(hist_bins, split_bin_index); + return GiniGroundTruthGain(pdf_hist, split_bin_index); } - return (double)0.0; + return double(0.0); } - auto _get_nLeft(std::vector const& hist_bins, IdxT idx) + auto NumLeftOfBin(std::vector const& cdf_hist, IdxT idx) { auto count{IdxT(0)}; for (auto c = 0; c < params.n_classes; ++c) { if constexpr (std::is_same::value) // countbin { - count += hist_bins[params.n_bins * c + idx].x; + count += cdf_hist[params.n_bins * c + idx].x; } else // aggregatebin { - count += hist_bins[params.n_bins * c + idx].count; + count += cdf_hist[params.n_bins * c + idx].count; } } return count; @@ -673,30 +692,32 @@ class ObjectiveTest : public ::testing::TestWithParam { params = ::testing::TestWithParam::GetParam(); ObjectiveT objective(params.n_classes, params.min_impurity_decrease, params.min_samples_leaf); - auto hist_bins = _gen_hist_bins(); - auto split_bin_index = _rand(params.n_bins); - auto ground_truth_gain = _get_ground_truth_gain(hist_bins, split_bin_index); - auto hypothesis_gain = objective.gain(&hist_bins[0], - split_bin_index, - params.n_bins, - _get_nLeft(hist_bins, params.n_bins - 1), - _get_nLeft(hist_bins, split_bin_index)); + auto [cdf_hist, pdf_hist] = GenHist(); + + auto split_bin_index = RandUnder(params.n_bins); + auto ground_truth_gain = GroundTruthGain(pdf_hist, split_bin_index); + + auto hypothesis_gain = objective.GainPerSplit(&cdf_hist[0], + split_bin_index, + params.n_bins, + NumLeftOfBin(cdf_hist, params.n_bins - 1), + NumLeftOfBin(cdf_hist, split_bin_index)); - ASSERT_EQ(ground_truth_gain, hypothesis_gain); + ASSERT_NEAR(ground_truth_gain, hypothesis_gain, params.tolerance); } }; const std::vector poisson_objective_test_parameters = { - {9507819643927052255LLU, 64, 1, 0.0001, 0}, - {9507819643927052256LLU, 128, 1, 0.0001, 1}, - {9507819643927052257LLU, 256, 1, 0.0001, 1}, - {9507819643927052258LLU, 512, 1, 0.0001, 5}, + {9507819643927052255LLU, 64, 1, 0.0001, 0, 0.000001}, + {9507819643927052259LLU, 128, 1, 0.0001, 1, 0.000001}, + {9507819643927052251LLU, 256, 1, 0.0001, 1, 0.000001}, + {9507819643927052258LLU, 512, 1, 0.0001, 5, 0.000001}, }; const std::vector gini_objective_test_parameters = { - {9507819643927052255LLU, 64, 2, 0.0001, 0}, - {9507819643927052256LLU, 128, 10, 0.0001, 1}, - {9507819643927052257LLU, 256, 100, 0.0001, 1}, - {9507819643927052258LLU, 512, 100, 0.0001, 5}, + {9507819643927052255LLU, 64, 2, 0.0001, 0, 0.000001}, + {9507819643927052256LLU, 128, 10, 0.0001, 1, 0.000001}, + {9507819643927052257LLU, 256, 100, 0.0001, 1, 0.000001}, + {9507819643927052258LLU, 512, 100, 0.0001, 5, 0.000001}, }; // poisson 
objective test @@ -707,7 +728,7 @@ INSTANTIATE_TEST_CASE_P(RfTests, ::testing::ValuesIn(poisson_objective_test_parameters)); // gini objective test -typedef ObjectiveTest> GiniObjectiveTestD; +typedef ObjectiveTest> GiniObjectiveTestD; TEST_P(GiniObjectiveTestD, giniObjectiveTest) {} INSTANTIATE_TEST_CASE_P(RfTests, GiniObjectiveTestD, From 79f00b8e69c7409339969738da08acb0774c2585 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Sat, 11 Sep 2021 22:28:43 +0530 Subject: [PATCH 15/42] add python level test --- python/cuml/test/test_random_forest.py | 37 +++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/python/cuml/test/test_random_forest.py b/python/cuml/test/test_random_forest.py index 46e7d572cd..6b0abda4ac 100644 --- a/python/cuml/test/test_random_forest.py +++ b/python/cuml/test/test_random_forest.py @@ -32,7 +32,7 @@ from sklearn.ensemble import RandomForestClassifier as skrfc from sklearn.ensemble import RandomForestRegressor as skrfr -from sklearn.metrics import accuracy_score, mean_squared_error +from sklearn.metrics import accuracy_score, mean_squared_error, mean_poisson_deviance from sklearn.datasets import fetch_california_housing, \ make_classification, make_regression, load_iris, load_breast_cancer, \ load_boston @@ -186,6 +186,41 @@ def special_reg(request): ) return X, y +@pytest.mark.parametrize("lam", [0.001, 0.01, 0.1]) +@pytest.mark.parametrize("max_depth", [2, 4, 7, 10, 25, 50]) +def test_poisson_convergence(lam, max_depth): + np.random.seed(33) + bootstrap = None + max_features = 1.0 + n_estimators = 1 + min_impurity_decrease = 1e-5 + n_datapoints = 100000 + # generating random poisson dataset + X = np.random.random((n_datapoints, 4)).astype(np.float32) + y = np.random.poisson(lam=lam, size=n_datapoints).astype(np.float32) + + poisson_preds = curfr( + split_criterion=4, + max_depth=max_depth, + n_estimators=n_estimators, + bootstrap=bootstrap, + max_features=max_features, + min_impurity_decrease=min_impurity_decrease).fit(X, y).predict(X) + mse_preds = curfr( + split_criterion=2, + max_depth=max_depth, + n_estimators=n_estimators, + bootstrap=bootstrap, + max_features=max_features, + min_impurity_decrease=min_impurity_decrease).fit(X, y).predict(X) + + mask = mse_preds > 0 # y should not be non-positive for mean_poisson_deviance + mse_mpd = mean_poisson_deviance(y[mask], mse_preds[mask]) + poisson_mpd = mean_poisson_deviance(y, poisson_preds) + + # model trained on poisson data with poisson criterion must perform better on poisson loss + assert mse_mpd >= poisson_mpd + @pytest.mark.parametrize( "max_samples", [unit_param(1.0), quality_param(0.90), stress_param(0.95)] From 13c3386e15822132a6735ff5e2a4398dde8028e4 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Mon, 13 Sep 2021 06:32:11 +0530 Subject: [PATCH 16/42] FIX clang format --- cpp/src/decisiontree/decisiontree.cuh | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/decisiontree/decisiontree.cuh b/cpp/src/decisiontree/decisiontree.cuh index f00c8fd52f..44a907379f 100644 --- a/cpp/src/decisiontree/decisiontree.cuh +++ b/cpp/src/decisiontree/decisiontree.cuh @@ -292,16 +292,16 @@ class DecisionTree { .train(); } else if (params.split_criterion == CRITERION::POISSON) { return Builder>(handle, - treeid, - seed, - params, - data, - labels, - nrows, - ncols, - rowids, - unique_labels, - quantiles) + treeid, + seed, + params, + data, + labels, + nrows, + ncols, + rowids, + unique_labels, + quantiles) .train(); } else { ASSERT(false, "Unknown 
split criterion."); From 0332cc669e1f5e39788c87782025a31465975932 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Mon, 13 Sep 2021 11:41:33 +0530 Subject: [PATCH 17/42] flake fix, reduce test load --- python/cuml/test/test_random_forest.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/python/cuml/test/test_random_forest.py b/python/cuml/test/test_random_forest.py index 6b0abda4ac..f0e5c835d3 100644 --- a/python/cuml/test/test_random_forest.py +++ b/python/cuml/test/test_random_forest.py @@ -32,7 +32,8 @@ from sklearn.ensemble import RandomForestClassifier as skrfc from sklearn.ensemble import RandomForestRegressor as skrfr -from sklearn.metrics import accuracy_score, mean_squared_error, mean_poisson_deviance +from sklearn.metrics import accuracy_score, mean_squared_error, \ + mean_poisson_deviance from sklearn.datasets import fetch_california_housing, \ make_classification, make_regression, load_iris, load_breast_cancer, \ load_boston @@ -186,8 +187,9 @@ def special_reg(request): ) return X, y -@pytest.mark.parametrize("lam", [0.001, 0.01, 0.1]) -@pytest.mark.parametrize("max_depth", [2, 4, 7, 10, 25, 50]) + +@pytest.mark.parametrize("lam", [0.01, 0.1]) +@pytest.mark.parametrize("max_depth", [2, 4]) def test_poisson_convergence(lam, max_depth): np.random.seed(33) bootstrap = None @@ -213,12 +215,13 @@ def test_poisson_convergence(lam, max_depth): bootstrap=bootstrap, max_features=max_features, min_impurity_decrease=min_impurity_decrease).fit(X, y).predict(X) - - mask = mse_preds > 0 # y should not be non-positive for mean_poisson_deviance + # y should not be non-positive for mean_poisson_deviance + mask = mse_preds > 0 mse_mpd = mean_poisson_deviance(y[mask], mse_preds[mask]) poisson_mpd = mean_poisson_deviance(y, poisson_preds) - # model trained on poisson data with poisson criterion must perform better on poisson loss + # model trained on poisson data with + # poisson criterion must perform better on poisson loss assert mse_mpd >= poisson_mpd From 0a5d52ab8a97f770aebe08c002dccaada350be00 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Mon, 13 Sep 2021 16:44:32 +0530 Subject: [PATCH 18/42] fix tests, remove artifacts --- build | 1 - cpp/test/sg/rf_test.cu | 9 +++++---- 2 files changed, 5 insertions(+), 5 deletions(-) delete mode 120000 build diff --git a/build b/build deleted file mode 120000 index 3e647c0fb6..0000000000 --- a/build +++ /dev/null @@ -1 +0,0 @@ -/home/gvenkatarama/cuml-builds/poisson/ \ No newline at end of file diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index 21ca039f54..7f6a49e16c 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -327,7 +327,8 @@ class RfSpecialisedTest { void TestDeterminism() { // Regression models use floating point atomics, so are not bitwise reproducible - bool is_regression = params.split_criterion == MSE || params.split_criterion == MAE; + bool is_regression = params.split_criterion == MSE or params.split_criterion == MAE or + params.split_criterion == POISSON; if (is_regression) return; // Repeat training @@ -394,7 +395,7 @@ class RfTest : public ::testing::TestWithParam { void SetUp() override { RfTestParams params = ::testing::TestWithParam::GetParam(); - bool is_regression = params.split_criterion == MSE || params.split_criterion == MAE || + bool is_regression = params.split_criterion == MSE or params.split_criterion == MAE or params.split_criterion == POISSON; if (params.double_precision) { if (is_regression) { @@ -529,7 +530,7 @@ class RFQuantileTest : public 
::testing::TestWithParam {
     int min_items_per_bin = max_items_per_bin - 1;
     int total_items       = 0;
     for (int b = 0; b < params.n_bins; b++) {
-      ASSERT_TRUE(h_histogram[b] == max_items_per_bin || h_histogram[b] == min_items_per_bin)
+      ASSERT_TRUE(h_histogram[b] == max_items_per_bin or h_histogram[b] == min_items_per_bin)
         << "No. samples in bin[" << b << "] = " << h_histogram[b] << " Expected "
         << max_items_per_bin << " or " << min_items_per_bin << std::endl;
       total_items += h_histogram[b];
@@ -689,7 +690,7 @@ class ObjectiveTest : public ::testing::TestWithParam {
     auto gain = parent_gini - ((left_n / n) * left_gini + (right_n / n) * right_gini);

     // edge cases
-    if (gain <= params.min_impurity_decrease || left_n < params.min_samples_leaf ||
+    if (gain <= params.min_impurity_decrease or left_n < params.min_samples_leaf or
         right_n < params.min_samples_leaf) {
       return -std::numeric_limits::max();
     } else {
       return gain;

From 959ee2c6c6c9521bbed17702714fa1fe3c1e53cf Mon Sep 17 00:00:00 2001
From: venkywonka
Date: Mon, 13 Sep 2021 16:50:23 +0530
Subject: [PATCH 19/42] purge artifacts

---
 docs/source/checkpoint.tl    | Bin 7684 -> 0 bytes
 docs/source/kmeans_model.pkl | Bin 1817 -> 0 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 docs/source/checkpoint.tl
 delete mode 100644 docs/source/kmeans_model.pkl

diff --git a/docs/source/checkpoint.tl b/docs/source/checkpoint.tl
deleted file mode 100644
index 79f1d38a910f88e85df79949b5cf160d48e5f12b..0000000000000000000000000000000000000000
GIT binary patch
[base85-encoded binary delta omitted]

diff --git a/docs/source/kmeans_model.pkl b/docs/source/kmeans_model.pkl
deleted file mode 100644
index 469082f65354dabc737c236582d6e88b7b2f4cc2..0000000000000000000000000000000000000000
GIT binary patch
[base85-encoded binary delta omitted]

From 5a5410e78882a492cb79ea52fbc916ae6b2d461e Mon Sep 17
00:00:00 2001 From: venkywonka Date: Mon, 13 Sep 2021 17:39:41 +0530 Subject: [PATCH 20/42] decrease tolerance --- cpp/test/sg/rf_test.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index 7f6a49e16c..bee44777a6 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -749,16 +749,16 @@ class ObjectiveTest : public ::testing::TestWithParam { }; const std::vector poisson_objective_test_parameters = { - {9507819643927052255LLU, 64, 1, 0.0001, 0, 0.000001}, - {9507819643927052259LLU, 128, 1, 0.0001, 1, 0.000001}, - {9507819643927052251LLU, 256, 1, 0.0001, 1, 0.000001}, - {9507819643927052258LLU, 512, 1, 0.0001, 5, 0.000001}, + {9507819643927052255LLU, 64, 1, 0.0001, 0, 0.00001}, + {9507819643927052259LLU, 128, 1, 0.0001, 1, 0.00001}, + {9507819643927052251LLU, 256, 1, 0.0001, 1, 0.00001}, + {9507819643927052258LLU, 512, 1, 0.0001, 5, 0.00001}, }; const std::vector gini_objective_test_parameters = { - {9507819643927052255LLU, 64, 2, 0.0001, 0, 0.000001}, - {9507819643927052256LLU, 128, 10, 0.0001, 1, 0.000001}, - {9507819643927052257LLU, 256, 100, 0.0001, 1, 0.000001}, - {9507819643927052258LLU, 512, 100, 0.0001, 5, 0.000001}, + {9507819643927052255LLU, 64, 2, 0.0001, 0, 0.00001}, + {9507819643927052256LLU, 128, 10, 0.0001, 1, 0.00001}, + {9507819643927052257LLU, 256, 100, 0.0001, 1, 0.00001}, + {9507819643927052258LLU, 512, 100, 0.0001, 5, 0.00001}, }; // poisson objective test From 59caf115144d046a4a1f688b686bb2377f45ee6a Mon Sep 17 00:00:00 2001 From: venkywonka Date: Thu, 16 Sep 2021 18:00:15 +0530 Subject: [PATCH 21/42] remove min_impurity_decrease member --- .../batched-levelalgo/builder.cuh | 4 +- .../batched-levelalgo/metrics.cuh | 50 +++++-------------- .../sg/decisiontree_batchedlevel_unittest.cu | 2 +- cpp/test/sg/rf_test.cu | 28 +++++------ 4 files changed, 29 insertions(+), 55 deletions(-) diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh index 8dc897f414..337e9c2415 100644 --- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh @@ -421,7 +421,7 @@ struct Builder { int nHistBins = large_blocks * nbins * colBlks * nclasses; CUDA_CHECK(cudaMemsetAsync(hist, 0, sizeof(BinT) * nHistBins, handle.get_stream())); ML::PUSH_RANGE("computeSplitClassificationKernel @builder_base.cuh [batched-levelalgo]"); - ObjectiveT objective(input.numOutputs, params.min_impurity_decrease, params.min_samples_leaf); + ObjectiveT objective(input.numOutputs, params.min_samples_leaf); computeSplitKernel <<>>(hist, params.n_bins, @@ -450,7 +450,7 @@ struct Builder { std::size_t max_batch_size = min(std::size_t(100000), tree->size()); rmm::device_uvector d_tree(max_batch_size, handle.get_stream()); rmm::device_uvector d_instance_ranges(max_batch_size, handle.get_stream()); - ObjectiveT objective(input.numOutputs, params.min_impurity_decrease, params.min_samples_leaf); + ObjectiveT objective(input.numOutputs, params.min_samples_leaf); for (std::size_t batch_begin = 0; batch_begin < tree->size(); batch_begin += max_batch_size) { std::size_t batch_end = min(batch_begin + max_batch_size, tree->size()); std::size_t batch_size = batch_end - batch_begin; diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index da96a4fb10..92146992b7 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ 
b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -30,7 +30,7 @@ struct CountBin { int x; CountBin(CountBin const&) = default; HDI CountBin(int x_) : x(x_) {} - HDI CountBin() : x(0){}; + HDI CountBin() : x(0) {} DI static void IncrementHistogram(CountBin* hist, int nbins, int b, int label) { @@ -55,7 +55,7 @@ struct AggregateBin { int count; AggregateBin(AggregateBin const&) = default; - HDI AggregateBin() : label_sum(0.0), count(0){}; + HDI AggregateBin() : label_sum(0.0), count(0) {} HDI AggregateBin(double label_sum, int count) : label_sum(label_sum), count(count) {} DI static void IncrementHistogram(AggregateBin* hist, int nbins, int b, double label) @@ -87,15 +87,12 @@ class GiniObjectiveFunction { using LabelT = LabelT_; using IdxT = IdxT_; IdxT nclasses; - DataT min_impurity_decrease; IdxT min_samples_leaf; public: using BinT = CountBin; - GiniObjectiveFunction(IdxT nclasses, DataT min_impurity_decrease, IdxT min_samples_leaf) - : nclasses(nclasses), - min_impurity_decrease(min_impurity_decrease), - min_samples_leaf(min_samples_leaf) + GiniObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) + : nclasses(nclasses), min_samples_leaf(min_samples_leaf) { } @@ -131,12 +128,7 @@ class GiniObjectiveFunction { gain -= val * val; } - // if the gain is not "enough", don't bother! - if (gain <= min_impurity_decrease) - return -std::numeric_limits::max(); - - else - return gain; + return gain; } DI Split Gain(BinT* shist, DataT* sbins, IdxT col, IdxT len, IdxT nbins) @@ -174,15 +166,12 @@ class EntropyObjectiveFunction { using LabelT = LabelT_; using IdxT = IdxT_; IdxT nclasses; - DataT min_impurity_decrease; IdxT min_samples_leaf; public: using BinT = CountBin; - EntropyObjectiveFunction(IdxT nclasses, DataT min_impurity_decrease, IdxT min_samples_leaf) - : nclasses(nclasses), - min_impurity_decrease(min_impurity_decrease), - min_samples_leaf(min_samples_leaf) + EntropyObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) + : nclasses(nclasses), min_samples_leaf(min_samples_leaf) { } DI IdxT NumClasses() const { return nclasses; } @@ -221,9 +210,6 @@ class EntropyObjectiveFunction { } } - // if the gain is not "enough", don't bother! - if (gain <= min_impurity_decrease) return -std::numeric_limits::max(); - return gain; } } @@ -256,15 +242,14 @@ class PoissonObjectiveFunction { using IdxT = IdxT_; private: - DataT min_impurity_decrease; IdxT min_samples_leaf; public: using BinT = AggregateBin; static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); - HDI PoissonObjectiveFunction(IdxT nclasses, DataT min_impurity_decrease, IdxT min_samples_leaf) - : min_impurity_decrease(min_impurity_decrease), min_samples_leaf(min_samples_leaf) + HDI PoissonObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) + : min_samples_leaf(min_samples_leaf) { } DI IdxT NumClasses() const { return 1; } @@ -304,12 +289,7 @@ class PoissonObjectiveFunction { auto gain = parent_obj - (left_obj + right_obj); gain = gain / len; - // if the gain is not "enough", don't bother! 
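// Worked example of the proxy above (natural logs): S = 8, n = 8 split into
// (S_L, n_L) = (6, 4) and (S_R, n_R) = (2, 4) gives
//   gain = (-8*log(8/8) + 6*log(6/4) + 2*log(2/4)) / 8
//        = (0 + 2.4328 - 1.3863) / 8 ~= 0.131,
// i.e. separating high-rate rows from low-rate rows reduces the Poisson half
// deviance. With this patch the objective no longer sees the impurity
// threshold, so construction reduces to (template arguments assumed):
//   PoissonObjectiveFunction<DataT, LabelT, IdxT> obj(/*nclasses=*/1,
//                                                     /*min_samples_leaf=*/1);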
- if (gain <= min_impurity_decrease) - return -std::numeric_limits::max(); - - else - return gain; + return gain; } DI Split Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) @@ -335,13 +315,12 @@ class MSEObjectiveFunction { using IdxT = IdxT_; private: - DataT min_impurity_decrease; IdxT min_samples_leaf; public: using BinT = AggregateBin; - HDI MSEObjectiveFunction(IdxT nclasses, DataT min_impurity_decrease, IdxT min_samples_leaf) - : min_impurity_decrease(min_impurity_decrease), min_samples_leaf(min_samples_leaf) + HDI MSEObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) + : min_samples_leaf(min_samples_leaf) { } DI IdxT NumClasses() const { return 1; } @@ -356,16 +335,13 @@ class MSEObjectiveFunction { return -std::numeric_limits::max(); } else { auto label_sum = hist[nbins - 1].label_sum; - auto parent_obj = -label_sum * label_sum * invLen; + auto parent_obj = -label_sum * label_sum / len; auto left_obj = -(hist[i].label_sum * hist[i].label_sum) / nLeft; auto right_label_sum = hist[i].label_sum - label_sum; auto right_obj = -(right_label_sum * right_label_sum) / nRight; gain = parent_obj - (left_obj + right_obj); gain *= invLen; - // if the gain is not "enough", don't bother! - if (gain <= min_impurity_decrease) return -std::numeric_limits::max(); - return gain; } } diff --git a/cpp/test/sg/decisiontree_batchedlevel_unittest.cu b/cpp/test/sg/decisiontree_batchedlevel_unittest.cu index 9c402bec2c..37b9519b8c 100644 --- a/cpp/test/sg/decisiontree_batchedlevel_unittest.cu +++ b/cpp/test/sg/decisiontree_batchedlevel_unittest.cu @@ -279,7 +279,7 @@ TEST_P(TestMetric, RegressionMetricGain) CRITERION split_criterion = GetParam(); - ObjectiveT obj(1, params.min_impurity_decrease, params.min_samples_leaf); + ObjectiveT obj(1, params.min_samples_leaf); size_t smemSize1 = n_bins * sizeof(ObjectiveT::BinT) + // shist size n_bins * sizeof(DataT) + // sbins size sizeof(int); // sDone size diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index bee44777a6..3c38a221b6 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -576,7 +576,6 @@ struct ObjectiveTestParameters { uint64_t seed; int n_bins; int n_classes; - double min_impurity_decrease; int min_samples_leaf; double tolerance; }; @@ -647,9 +646,9 @@ class ObjectiveTest : public ::testing::TestWithParam { (n_right / n) * right_phd); // gain in long form without proxy // edge cases - if (gain <= params.min_impurity_decrease or n_left < params.min_samples_leaf or - n_right < params.min_samples_leaf or label_sum < ObjectiveT::eps_ or - label_sum_right < ObjectiveT::eps_ or label_sum_left < ObjectiveT::eps_) + if (n_left < params.min_samples_leaf or n_right < params.min_samples_leaf or + label_sum < ObjectiveT::eps_ or label_sum_right < ObjectiveT::eps_ or + label_sum_left < ObjectiveT::eps_) return -std::numeric_limits::max(); else return gain; @@ -690,8 +689,7 @@ class ObjectiveTest : public ::testing::TestWithParam { auto gain = parent_gini - ((left_n / n) * left_gini + (right_n / n) * right_gini); // edge cases - if (gain <= params.min_impurity_decrease or left_n < params.min_samples_leaf or - right_n < params.min_samples_leaf) { + if (left_n < params.min_samples_leaf or right_n < params.min_samples_leaf) { return -std::numeric_limits::max(); } else { return gain; @@ -731,7 +729,7 @@ class ObjectiveTest : public ::testing::TestWithParam { { srand(params.seed); params = ::testing::TestWithParam::GetParam(); - ObjectiveT objective(params.n_classes, params.min_impurity_decrease, 
params.min_samples_leaf); + ObjectiveT objective(params.n_classes, params.min_samples_leaf); auto [cdf_hist, pdf_hist] = GenHist(); @@ -749,16 +747,16 @@ class ObjectiveTest : public ::testing::TestWithParam { }; const std::vector poisson_objective_test_parameters = { - {9507819643927052255LLU, 64, 1, 0.0001, 0, 0.00001}, - {9507819643927052259LLU, 128, 1, 0.0001, 1, 0.00001}, - {9507819643927052251LLU, 256, 1, 0.0001, 1, 0.00001}, - {9507819643927052258LLU, 512, 1, 0.0001, 5, 0.00001}, + {9507819643927052255LLU, 64, 1, 0, 0.00001}, + {9507819643927052259LLU, 128, 1, 1, 0.00001}, + {9507819643927052251LLU, 256, 1, 1, 0.00001}, + {9507819643927052258LLU, 512, 1, 5, 0.00001}, }; const std::vector gini_objective_test_parameters = { - {9507819643927052255LLU, 64, 2, 0.0001, 0, 0.00001}, - {9507819643927052256LLU, 128, 10, 0.0001, 1, 0.00001}, - {9507819643927052257LLU, 256, 100, 0.0001, 1, 0.00001}, - {9507819643927052258LLU, 512, 100, 0.0001, 5, 0.00001}, + {9507819643927052255LLU, 64, 2, 0, 0.00001}, + {9507819643927052256LLU, 128, 10, 1, 0.00001}, + {9507819643927052257LLU, 256, 100, 1, 0.00001}, + {9507819643927052258LLU, 512, 100, 5, 0.00001}, }; // poisson objective test From fd42fb78a344085644f8f430f9d56c7b1ca3d71c Mon Sep 17 00:00:00 2001 From: venkywonka Date: Fri, 17 Sep 2021 17:02:44 +0530 Subject: [PATCH 22/42] fix accuracy bug and dask docstring duplication --- cpp/src/decisiontree/batched-levelalgo/metrics.cuh | 12 ++++++------ python/cuml/dask/ensemble/randomforestclassifier.py | 10 +++------- python/cuml/dask/ensemble/randomforestregressor.py | 4 ++-- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index 92146992b7..e87e6b3627 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -334,12 +334,12 @@ class MSEObjectiveFunction { if (nLeft < min_samples_leaf || nRight < min_samples_leaf) { return -std::numeric_limits::max(); } else { - auto label_sum = hist[nbins - 1].label_sum; - auto parent_obj = -label_sum * label_sum / len; - auto left_obj = -(hist[i].label_sum * hist[i].label_sum) / nLeft; - auto right_label_sum = hist[i].label_sum - label_sum; - auto right_obj = -(right_label_sum * right_label_sum) / nRight; - gain = parent_obj - (left_obj + right_obj); + auto label_sum = hist[nbins - 1].label_sum; + DataT parent_obj = -label_sum * label_sum / len; + DataT left_obj = -(hist[i].label_sum * hist[i].label_sum) / nLeft; + DataT right_label_sum = hist[i].label_sum - label_sum; + DataT right_obj = -(right_label_sum * right_label_sum) / nRight; + gain = parent_obj - (left_obj + right_obj); gain *= invLen; return gain; diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py index 692d9e3a0e..7e57301818 100755 --- a/python/cuml/dask/ensemble/randomforestclassifier.py +++ b/python/cuml/dask/ensemble/randomforestclassifier.py @@ -74,16 +74,12 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin, run different models concurrently in different streams by creating handles in several streams. If it is None, a new one is created. - split_criterion : The criterion used to split nodes. - 0 for GINI, 1 for ENTROPY, 4 for CRITERION_END. + split_criterion : int (default = 2) + The criterion used to split nodes. + 0 for GINI, 1 for ENTROPY, 5 for CRITERION_END. 
    2 and 3 not valid for classification
-   (default = 0)
    split_algo : 0 for HIST and 1 for GLOBAL_QUANTILE (default = 1)
        the algorithm to determine how nodes are split in the tree.
    bootstrap : boolean (default = True)
        Control bootstrapping.
        If set, each tree in the forest is built
diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py
index c2521d21c7..caf909dc7e 100755
--- a/python/cuml/dask/ensemble/randomforestregressor.py
+++ b/python/cuml/dask/ensemble/randomforestregressor.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
 from cuml.dask.common.base import DelayedPredictionMixin
 from cuml.ensemble import RandomForestRegressor as cuRFR
 from cuml.dask.ensemble.base import \
@@ -74,7 +73,8 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin,
     split_criterion : int (default = 2)
         The criterion used to split nodes.
         0 for GINI, 1 for ENTROPY,
-        2 for MSE, 3 for MAE and 4 for POISSON
+        2 for MSE, 3 for MAE, 4 for POISSON,
+        and 5 for CRITERION_END
        0 and 1 not valid for regression
     bootstrap : boolean (default = True)
        Control bootstrapping.

From a31512ddc8bd6d105e5cf14b0ccb0fcb2ddc9d00 Mon Sep 17 00:00:00 2001
From: venkywonka
Date: Fri, 17 Sep 2021 17:17:07 +0530
Subject: [PATCH 23/42] fix docstring slip

---
 python/cuml/dask/ensemble/randomforestclassifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py
index 7e57301818..9f0166561e 100755
--- a/python/cuml/dask/ensemble/randomforestclassifier.py
+++ b/python/cuml/dask/ensemble/randomforestclassifier.py
@@ -74,7 +74,7 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin,
         run different models concurrently in different streams by creating
         handles in several streams.
         If it is None, a new one is created.
-    split_criterion : int (default = 2)
+    split_criterion : int (default = 0)
        The criterion used to split nodes.
        0 for GINI, 1 for ENTROPY, 5 for CRITERION_END.
2 and 3 not valid for classification From 493f847da3aecda97e378480d1cb39a10d3f37b6 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Fri, 17 Sep 2021 22:06:27 +0530 Subject: [PATCH 24/42] merge resolution --- cpp/src/decisiontree/batched-levelalgo/metrics.cuh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index 3dbd7809fb..fdcf8c18df 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -304,9 +304,11 @@ class PoissonObjectiveFunction { return sp; } - static DI LabelT LeafPrediction(BinT const* shist, int nclasses) + static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out) { - return shist[0].label_sum / shist[0].count; + for (int i = 0; i < nclasses; i++) { + out[i] = shist[i].label_sum / shist[i].count; + } } }; template From aec9d261902f5e0b52bc28ab059191f1c144e93d Mon Sep 17 00:00:00 2001 From: venkywonka Date: Mon, 20 Sep 2021 18:28:40 +0530 Subject: [PATCH 25/42] merge with poisson branch --- .../batched-levelalgo/metrics.cuh | 221 ++++++++++++------ cpp/test/sg/rf_test.cu | 14 ++ 2 files changed, 169 insertions(+), 66 deletions(-) diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index fdcf8c18df..610490072c 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -236,65 +236,33 @@ class EntropyObjectiveFunction { } }; +/** @brief The abstract base class for the tweedie family of objective functions: + * mean-squared-error(p=0), poisson(p=1), gamma(p=2) and inverse gaussian(p=3) + **/ template -class PoissonObjectiveFunction { +class TweedieObjectiveFunction { + public: using DataT = DataT_; using LabelT = LabelT_; using IdxT = IdxT_; + using BinT = AggregateBin; - private: + protected: IdxT min_samples_leaf; public: - using BinT = AggregateBin; - static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); - HDI PoissonObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) + HDI TweedieObjectiveFunction(IdxT min_samples_leaf) : min_samples_leaf(min_samples_leaf) { } - DI IdxT NumClasses() const { return 1; } - - /** - * @brief compute the poisson impurity reduction (or purity gain) for each split - * - * @note This method is used to speed up the search for the best split - * by calculating the gain using a proxy poisson half deviance reduction. - * It is a proxy quantity such that the split that maximizes this value - * also maximizes the impurity improvement. It neglects all constant terms - * of the impurity decrease for a given split. - * The Gain is the difference in the proxy impurities of the parent and the - * weighted sum of impurities of its children. - */ - HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) - { - // get the lens' - auto nRight = len - nLeft; - - // if there aren't enough samples in this split, don't bother! 
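// For reference, the unit half deviance of the Tweedie family at variance
// power p, matching the special cases named in the class comment above
// (p=0 squared error, p=1 Poisson, p=2 gamma, p=3 inverse Gaussian). This is
// a host-side sketch of the textbook formula, not the training-time proxy:
#include <cmath>
double TweedieHalfDeviance(double y, double mu, double p)
{
  if (p == 0.0) return 0.5 * (y - mu) * (y - mu);
  if (p == 1.0) return y * std::log(y / mu) + mu - y;    // requires y > 0
  if (p == 2.0) return std::log(mu / y) + y / mu - 1.0;  // requires y, mu > 0
  return std::pow(y, 2.0 - p) / ((1.0 - p) * (2.0 - p)) -
         y * std::pow(mu, 1.0 - p) / (1.0 - p) +
         std::pow(mu, 2.0 - p) / (2.0 - p);
}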
- if (nLeft < min_samples_leaf || nRight < min_samples_leaf) - return -std::numeric_limits::max(); - - auto label_sum = hist[nbins - 1].label_sum; - auto left_label_sum = (hist[i].label_sum); - auto right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum); - // label sum cannot be non-positive - if (label_sum < eps_ || left_label_sum < eps_ || right_label_sum < eps_) - return -std::numeric_limits::max(); + DI IdxT NumClasses() const { return 1; } - // compute the gain to be - DataT parent_obj = -label_sum * raft::myLog(label_sum / len); - DataT left_obj = -left_label_sum * raft::myLog(left_label_sum / nLeft); - DataT right_obj = -right_label_sum * raft::myLog(right_label_sum / nRight); - auto gain = parent_obj - (left_obj + right_obj); - gain = gain / len; + HDI virtual DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const = 0; - return gain; - } - - DI Split Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) + DI Split Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const { Split sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { @@ -310,32 +278,29 @@ class PoissonObjectiveFunction { out[i] = shist[i].label_sum / shist[i].count; } } + }; template -class MSEObjectiveFunction { +class MSEObjectiveFunction : public TweedieObjectiveFunction { public: using DataT = DataT_; using LabelT = LabelT_; using IdxT = IdxT_; - - private: - IdxT min_samples_leaf; - - public: + // using BinT = typename TweedieObjectiveFunction::BinT; using BinT = AggregateBin; + HDI MSEObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) - : min_samples_leaf(min_samples_leaf) + : TweedieObjectiveFunction{min_samples_leaf} { } - DI IdxT NumClasses() const { return 1; } - HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) + HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const { auto gain{DataT(0)}; auto nRight{len - nLeft}; auto invLen{DataT(1.0) / len}; // if there aren't enough samples in this split, don't bother! - if (nLeft < min_samples_leaf || nRight < min_samples_leaf) { + if (nLeft < this->min_samples_leaf || nRight < this->min_samples_leaf) { return -std::numeric_limits::max(); } else { auto label_sum = hist[nbins - 1].label_sum; @@ -349,24 +314,148 @@ class MSEObjectiveFunction { return gain; } } +}; - DI Split Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) +template +class PoissonObjectiveFunction : public TweedieObjectiveFunction { + public: + using DataT = DataT_; + using LabelT = LabelT_; + using IdxT = IdxT_; + // using BinT = typename TweedieObjectiveFunction::BinT; + using BinT = AggregateBin; + + static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); + + HDI PoissonObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) + : TweedieObjectiveFunction{min_samples_leaf} { - Split sp; - for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { - auto nLeft = shist[i].count; - sp.update({sbins[i], col, GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); - } - return sp; } - static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out) + /** + * @brief compute the poisson impurity reduction (or purity gain) for each split + * + * @note This method is used to speed up the search for the best split + * by calculating the gain using a proxy poisson half deviance reduction. + * It is a proxy quantity such that the split that maximizes this value + * also maximizes the impurity improvement. 
It neglects all constant terms + * of the impurity decrease for a given split. + * The Gain is the difference in the proxy impurities of the parent and the + * weighted sum of impurities of its children. + */ + HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const { - for (int i = 0; i < nclasses; i++) { - out[i] = shist[i].label_sum / shist[i].count; - } + // get the lens' + auto nRight = len - nLeft; + + // if there aren't enough samples in this split, don't bother! + if (nLeft < this->min_samples_leaf || nRight < this->min_samples_leaf) + return -std::numeric_limits::max(); + + auto label_sum = hist[nbins - 1].label_sum; + auto left_label_sum = (hist[i].label_sum); + auto right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum); + + // label sum cannot be non-positive + if (label_sum < eps_ || left_label_sum < eps_ || right_label_sum < eps_) + return -std::numeric_limits::max(); + + // compute the gain to be + DataT parent_obj = -label_sum * raft::myLog(label_sum / len); + DataT left_obj = -left_label_sum * raft::myLog(left_label_sum / nLeft); + DataT right_obj = -right_label_sum * raft::myLog(right_label_sum / nRight); + auto gain = parent_obj - (left_obj + right_obj); + gain = gain / len; + + return gain; } + }; -} // namespace DT -} // namespace ML +// template +// class GammaObjectiveFunction : public TweedieObjectiveFunction { +// public: +// using DataT = DataT_; +// using LabelT = LabelT_; +// using IdxT = IdxT_; +// // using BinT = typename TweedieObjectiveFunction::BinT; +// using BinT = AggregateBin; +// static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); + +// HDI GammaObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) +// : TweedieObjectiveFunction{min_samples_leaf} +// { +// } + +// HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const +// { +// // get the lens' +// auto nRight = len - nLeft; + +// // if there aren't enough samples in this split, don't bother! +// if (nLeft < this->min_samples_leaf || nRight < this->min_samples_leaf) +// return -std::numeric_limits::max(); + +// auto label_sum = hist[nbins - 1].label_sum; +// auto left_label_sum = (hist[i].label_sum); +// auto right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum); + +// // label sum cannot be non-positive +// if (label_sum < eps_ || left_label_sum < eps_ || right_label_sum < eps_) +// return -std::numeric_limits::max(); + +// // compute the gain to be +// DataT parent_obj = len * raft::myLog(label_sum / len); +// DataT left_obj = nLeft * raft::myLog(left_label_sum / nLeft); +// DataT right_obj = nRight * raft::myLog(right_label_sum / nRight); +// auto gain = parent_obj - (left_obj + right_obj); +// gain = gain / len; + +// return gain; +// } +// }; + +// template +// class InverseGaussianObjectiveFunction : public TweedieObjectiveFunction { +// public: +// using DataT = DataT_; +// using LabelT = LabelT_; +// using IdxT = IdxT_; +// // using BinT = typename TweedieObjectiveFunction::BinT; +// using BinT = AggregateBin; +// static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); + +// HDI InverseGaussianObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) +// : TweedieObjectiveFunction{min_samples_leaf} +// { +// } + +// HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const +// { +// // get the lens' +// auto nRight = len - nLeft; + +// // if there aren't enough samples in this split, don't bother! 
// if (nLeft < this->min_samples_leaf || nRight < this->min_samples_leaf)
//   return -std::numeric_limits<DataT>::max();

// auto label_sum       = hist[nbins - 1].label_sum;
// auto left_label_sum  = (hist[i].label_sum);
// auto right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum);

// // label sum cannot be non-positive
// if (label_sum < eps_ || left_label_sum < eps_ || right_label_sum < eps_)
//   return -std::numeric_limits<DataT>::max();

// // compute the gain to be
// DataT parent_obj = len * raft::myLog(label_sum / len);
// DataT left_obj   = nLeft * raft::myLog(left_label_sum / nLeft);
// DataT right_obj  = nRight * raft::myLog(right_label_sum / nRight);
// auto gain = parent_obj - (left_obj + right_obj);
// gain = gain / len;

// return gain;
// }
// };
 } // end namespace DT
 } // end namespace ML
\ No newline at end of file
diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index 4f20085a3d..4010bd0697 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include 
 #include 
 #include 
 
@@ -289,23 +290,29 @@ class RfSpecialisedTest {
   // Current model should be at least as accurate as a model with depth - 1
   void TestAccuracyImprovement()
   {
+    CUML_LOG_TRACE("inside test accuracy improvement: %d", __LINE__);
     if (params.max_depth <= 1) { return; }
     // averaging between models can introduce variance
     if (params.n_trees > 1) { return; }
     // accuracy is not guaranteed to improve with bootstrapping
     if (params.bootstrap) { return; }
+    CUML_LOG_TRACE("%d", __LINE__);
     raft::handle_t handle(params.n_streams);
     RfTestParams alt_params = params;
     alt_params.max_depth--;
+    CUML_LOG_TRACE("%d", __LINE__);
     auto [alt_forest, alt_predictions, alt_metrics] =
       TrainScore(handle, alt_params, X.data().get(), X_transpose.data().get(), y.data().get());
+    CUML_LOG_TRACE("%d", __LINE__);
     double eps = 1e-8;
     if (params.split_criterion == MSE) {
       EXPECT_LE(training_metrics.mean_squared_error, alt_metrics.mean_squared_error + eps);
     } else if (params.split_criterion == MAE) {
       EXPECT_LE(training_metrics.mean_abs_error, alt_metrics.mean_abs_error + eps);
     } else {
+      CUML_LOG_TRACE("%d", __LINE__);
       EXPECT_GE(training_metrics.accuracy, alt_metrics.accuracy);
+      CUML_LOG_TRACE("%d", __LINE__);
     }
   }
   // Regularisation parameters are working correctly
@@ -428,12 +435,19 @@ class RfSpecialisedTest {
   }
   void Test()
   {
+    CUML_LOG_TRACE("inside test");
     TestAccuracyImprovement();
+    CUML_LOG_TRACE("%d", __LINE__);
     TestDeterminism();
+    CUML_LOG_TRACE("%d", __LINE__);
     TestMinImpurity();
+    CUML_LOG_TRACE("%d", __LINE__);
     TestTreeSize();
+    CUML_LOG_TRACE("%d", __LINE__);
     TestInstanceCounts();
+    CUML_LOG_TRACE("%d", __LINE__);
     TestFilPredict();
+    CUML_LOG_TRACE("%d", __LINE__);
   }
 
   RF_metrics training_metrics;

From db09e0f63b49e2fa25aa616e862c66e1f69ac035 Mon Sep 17 00:00:00 2001
From: venkywonka
Date: Tue, 21 Sep 2021 21:32:06 +0530
Subject: [PATCH 26/42] add tweedie losses

---
 cpp/include/cuml/tree/algo_helper.h           |   2 +
 .../batched-levelalgo/metrics.cuh             | 229 ++++++++++--------
 cpp/src/decisiontree/decisiontree.cuh         |  26 ++
 cpp/test/sg/rf_test.cu                        | 145 +++++++++--
 4 files changed, 288 insertions(+), 114 deletions(-)

diff --git a/cpp/include/cuml/tree/algo_helper.h b/cpp/include/cuml/tree/algo_helper.h
index ae7aa9b9d1..483f936118 100644
--- a/cpp/include/cuml/tree/algo_helper.h
+++ b/cpp/include/cuml/tree/algo_helper.h
@@ -23,6 +23,8 @@ enum CRITERION {
   MSE,
   MAE,
   POISSON,
+  GAMMA,
+  INVERSE_GAUSSIAN,
   CRITERION_END,
 };
 
diff --git
a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index 610490072c..b92459a081 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -246,13 +246,12 @@ class TweedieObjectiveFunction { using DataT = DataT_; using LabelT = LabelT_; using IdxT = IdxT_; - using BinT = AggregateBin; + using BinT = AggregateBin; protected: IdxT min_samples_leaf; public: - HDI TweedieObjectiveFunction(IdxT min_samples_leaf) : min_samples_leaf(min_samples_leaf) { @@ -260,17 +259,6 @@ class TweedieObjectiveFunction { DI IdxT NumClasses() const { return 1; } - HDI virtual DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const = 0; - - DI Split Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const - { - Split sp; - for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { - auto nLeft = shist[i].count; - sp.update({sbins[i], col, GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); - } - return sp; - } static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out) { @@ -280,6 +268,7 @@ class TweedieObjectiveFunction { } }; + template class MSEObjectiveFunction : public TweedieObjectiveFunction { public: @@ -314,6 +303,17 @@ class MSEObjectiveFunction : public TweedieObjectiveFunction Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const + { + Split sp; + for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { + auto nLeft = shist[i].count; + sp.update({sbins[i], col, this->GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); + } + return sp; + } + }; template @@ -370,92 +370,125 @@ class PoissonObjectiveFunction : public TweedieObjectiveFunction Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const + { + Split sp; + for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { + auto nLeft = shist[i].count; + sp.update({sbins[i], col, this->GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); + } + return sp; + } + + }; -// template -// class GammaObjectiveFunction : public TweedieObjectiveFunction { -// public: -// using DataT = DataT_; -// using LabelT = LabelT_; -// using IdxT = IdxT_; -// // using BinT = typename TweedieObjectiveFunction::BinT; -// using BinT = AggregateBin; -// static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); - -// HDI GammaObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) -// : TweedieObjectiveFunction{min_samples_leaf} -// { -// } - -// HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const -// { -// // get the lens' -// auto nRight = len - nLeft; - -// // if there aren't enough samples in this split, don't bother! 
-// if (nLeft < this->min_samples_leaf || nRight < this->min_samples_leaf) -// return -std::numeric_limits::max(); - -// auto label_sum = hist[nbins - 1].label_sum; -// auto left_label_sum = (hist[i].label_sum); -// auto right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum); - -// // label sum cannot be non-positive -// if (label_sum < eps_ || left_label_sum < eps_ || right_label_sum < eps_) -// return -std::numeric_limits::max(); - -// // compute the gain to be -// DataT parent_obj = len * raft::myLog(label_sum / len); -// DataT left_obj = nLeft * raft::myLog(left_label_sum / nLeft); -// DataT right_obj = nRight * raft::myLog(right_label_sum / nRight); -// auto gain = parent_obj - (left_obj + right_obj); -// gain = gain / len; - -// return gain; -// } -// }; - -// template -// class InverseGaussianObjectiveFunction : public TweedieObjectiveFunction { -// public: -// using DataT = DataT_; -// using LabelT = LabelT_; -// using IdxT = IdxT_; -// // using BinT = typename TweedieObjectiveFunction::BinT; -// using BinT = AggregateBin; -// static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); - -// HDI InverseGaussianObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) -// : TweedieObjectiveFunction{min_samples_leaf} -// { -// } - -// HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const -// { -// // get the lens' -// auto nRight = len - nLeft; - -// // if there aren't enough samples in this split, don't bother! -// if (nLeft < this->min_samples_leaf || nRight < this->min_samples_leaf) -// return -std::numeric_limits::max(); - -// auto label_sum = hist[nbins - 1].label_sum; -// auto left_label_sum = (hist[i].label_sum); -// auto right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum); - -// // label sum cannot be non-positive -// if (label_sum < eps_ || left_label_sum < eps_ || right_label_sum < eps_) -// return -std::numeric_limits::max(); - -// // compute the gain to be -// DataT parent_obj = len * raft::myLog(label_sum / len); -// DataT left_obj = nLeft * raft::myLog(left_label_sum / nLeft); -// DataT right_obj = nRight * raft::myLog(right_label_sum / nRight); -// auto gain = parent_obj - (left_obj + right_obj); -// gain = gain / len; - -// return gain; -// } -// }; +template +class GammaObjectiveFunction : public TweedieObjectiveFunction { + public: + using DataT = DataT_; + using LabelT = LabelT_; + using IdxT = IdxT_; + using BinT = AggregateBin; + static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); + + HDI GammaObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) + : TweedieObjectiveFunction{min_samples_leaf} + { + } + + HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const + { + printf("inside GAMMA::GainPerSplit\n"); + // get the lens' + IdxT nRight = len - nLeft; + + // if there aren't enough samples in this split, don't bother! 
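+    // (A sketch of the algebra, with S, S_L, S_R the label sums and n, n_L,
+    //  n_R the sample counts of the parent and child partitions: the
+    //  sum(log(y)) term of the gamma half deviance is constant across splits,
+    //  so the deviance-based gain reduces to
+    //      log(S / n) - (n_L / n) * log(S_L / n_L) - (n_R / n) * log(S_R / n_R),
+    //  which is what the code below computes.)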
+    if (nLeft < this->min_samples_leaf || nRight < this->min_samples_leaf)
+      return -std::numeric_limits<DataT>::max();
+
+    DataT label_sum       = hist[nbins - 1].label_sum;
+    DataT left_label_sum  = (hist[i].label_sum);
+    DataT right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum);
+
+    // label sum cannot be non-positive
+    if (label_sum < eps_ || left_label_sum < eps_ || right_label_sum < eps_)
+      return -std::numeric_limits<DataT>::max();
+
+    // compute the gain to be
+    DataT parent_obj = raft::myLog(label_sum / len);
+    printf("parent_obj: %f\n", parent_obj);
+    DataT left_obj = (DataT(nLeft) / DataT(len)) * raft::myLog(left_label_sum / nLeft);
+    printf("left_obj: %f\n", left_obj);
+    DataT right_obj = (DataT(nRight) / DataT(len)) * raft::myLog(right_label_sum / nRight);
+    printf("right_obj: %f\n", right_obj);
+    DataT gain = parent_obj - (left_obj + right_obj);
+    // gain = gain / DataT(len);
+
+    return gain;
+  }
+
+  DI Split<DataT, IdxT> Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const
+  {
+    Split<DataT, IdxT> sp;
+    for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) {
+      auto nLeft = shist[i].count;
+      sp.update({sbins[i], col, this->GainPerSplit(shist, i, nbins, len, nLeft), nLeft});
+    }
+    return sp;
+  }
+};
+
+template <typename DataT_, typename LabelT_, typename IdxT_>
+class InverseGaussianObjectiveFunction : public TweedieObjectiveFunction<DataT_, LabelT_, IdxT_> {
+ public:
+  using DataT  = DataT_;
+  using LabelT = LabelT_;
+  using IdxT   = IdxT_;
+  using BinT   = AggregateBin;
+  static constexpr auto eps_ = 10 * std::numeric_limits<DataT>::epsilon();
+
+  HDI InverseGaussianObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf)
+    : TweedieObjectiveFunction<DataT, LabelT, IdxT>{min_samples_leaf}
+  {
+  }
+
+  HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const
+  {
+    // get the lens'
+    auto nRight = len - nLeft;
+
+    // if there aren't enough samples in this split, don't bother!
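+    // (For reference, a sketch of the algebra: with label sums S, S_L, S_R and
+    //  counts n, n_L, n_R, the inverse-Gaussian half deviance
+    //  (1/(2n)) * sum((y - y_pred)^2 / (y * y_pred^2)) yields a deviance gain
+    //  proportional to (n_L^2/S_L + n_R^2/S_R - n^2/S) / n, which is always
+    //  non-negative by Cauchy-Schwarz; the sign convention of the objective
+    //  terms below is corrected in the following patch.)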
+    if (nLeft < this->min_samples_leaf || nRight < this->min_samples_leaf)
+      return -std::numeric_limits<DataT>::max();
+
+    auto label_sum       = hist[nbins - 1].label_sum;
+    auto left_label_sum  = (hist[i].label_sum);
+    auto right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum);
+
+    // label sum cannot be non-positive
+    if (label_sum < eps_ || left_label_sum < eps_ || right_label_sum < eps_)
+      return -std::numeric_limits<DataT>::max();
+
+    // compute the gain to be
+    DataT parent_obj = DataT(len) * DataT(len) / label_sum;
+    DataT left_obj   = DataT(nLeft) * DataT(nLeft) / left_label_sum;
+    DataT right_obj  = DataT(nRight) * DataT(nRight) / right_label_sum;
+    auto gain = parent_obj - (left_obj + right_obj);
+    gain      = gain / len;
+
+    return gain;
+  }
+
+  DI Split<DataT, IdxT> Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const
+  {
+    Split<DataT, IdxT> sp;
+    for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) {
+      auto nLeft = shist[i].count;
+      sp.update({sbins[i], col, this->GainPerSplit(shist, i, nbins, len, nLeft), nLeft});
+    }
+    return sp;
+  }
+};
 } // end namespace DT
 } // end namespace ML
\ No newline at end of file
diff --git a/cpp/src/decisiontree/decisiontree.cuh b/cpp/src/decisiontree/decisiontree.cuh
index e2284cc14a..33aeac0d75 100644
--- a/cpp/src/decisiontree/decisiontree.cuh
+++ b/cpp/src/decisiontree/decisiontree.cuh
@@ -303,6 +303,32 @@ class DecisionTree {
                unique_labels,
                quantiles)
         .train();
+    } else if (params.split_criterion == CRITERION::GAMMA) {
+      return Builder<GammaObjectiveFunction<DataT, LabelT, IdxT>>(handle,
+                treeid,
+                seed,
+                params,
+                data,
+                labels,
+                nrows,
+                ncols,
+                rowids,
+                unique_labels,
+                quantiles)
+        .train();
+    } else if (params.split_criterion == CRITERION::INVERSE_GAUSSIAN) {
+      return Builder<InverseGaussianObjectiveFunction<DataT, LabelT, IdxT>>(handle,
+                treeid,
+                seed,
+                params,
+                data,
+                labels,
+                nrows,
+                ncols,
+                rowids,
+                unique_labels,
+                quantiles)
+        .train();
     } else {
       ASSERT(false, "Unknown split criterion.");
     }
diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index 4010bd0697..5472ffcd95 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
*/ - #include #include @@ -285,34 +284,29 @@ class RfSpecialisedTest { std::tie(forest, predictions, training_metrics) = TrainScore(handle, params, X.data().get(), X_transpose.data().get(), y.data().get()); + Test(); } // Current model should be at least as accurate as a model with depth - 1 void TestAccuracyImprovement() { - CUML_LOG_TRACE("inside test accuracy improvement: %d", __LINE__); if (params.max_depth <= 1) { return; } // avereraging between models can introduce variance if (params.n_trees > 1) { return; } // accuracy is not guaranteed to improve with bootstrapping if (params.bootstrap) { return; } - CUML_LOG_TRACE("%d", __LINE__); raft::handle_t handle(params.n_streams); RfTestParams alt_params = params; alt_params.max_depth--; - CUML_LOG_TRACE("%d", __LINE__); auto [alt_forest, alt_predictions, alt_metrics] = TrainScore(handle, alt_params, X.data().get(), X_transpose.data().get(), y.data().get()); - CUML_LOG_TRACE("%d", __LINE__); double eps = 1e-8; if (params.split_criterion == MSE) { EXPECT_LE(training_metrics.mean_squared_error, alt_metrics.mean_squared_error + eps); } else if (params.split_criterion == MAE) { EXPECT_LE(training_metrics.mean_abs_error, alt_metrics.mean_abs_error + eps); } else { - CUML_LOG_TRACE("%d", __LINE__); EXPECT_GE(training_metrics.accuracy, alt_metrics.accuracy); - CUML_LOG_TRACE("%d", __LINE__); } } // Regularisation parameters are working correctly @@ -435,19 +429,12 @@ class RfSpecialisedTest { } void Test() { - CUML_LOG_TRACE("inside test"); TestAccuracyImprovement(); - CUML_LOG_TRACE("%d", __LINE__); TestDeterminism(); - CUML_LOG_TRACE("%d", __LINE__); TestMinImpurity(); - CUML_LOG_TRACE("%d", __LINE__); TestTreeSize(); - CUML_LOG_TRACE("%d", __LINE__); TestInstanceCounts(); - CUML_LOG_TRACE("%d", __LINE__); TestFilPredict(); - CUML_LOG_TRACE("%d", __LINE__); } RF_metrics training_metrics; @@ -500,7 +487,8 @@ std::vector min_samples_split = {2, 10}; std::vector min_impurity_decrease = {0.0f, 1.0f, 10.0f}; std::vector n_streams = {1, 2, 10}; std::vector split_criterion = { - CRITERION::POISSON, CRITERION::MSE, CRITERION::GINI, CRITERION::ENTROPY}; + // CRITERION::POISSON, + CRITERION::MSE, CRITERION::GINI, CRITERION::ENTROPY}; std::vector seed = {0, 17}; std::vector n_labels = {2, 10, 20}; std::vector double_precision = {false, true}; @@ -696,7 +684,12 @@ class ObjectiveTest : public ::testing::TestWithParam { ObjectiveTestParameters params; public: - auto RandUnder(int const end = 10000) { return rand() % end; } + auto RandUnder(int const end = 100000) { return rand() % end; } + + auto GenSortedData() + { + + } auto GenHist() { @@ -720,6 +713,91 @@ class ObjectiveTest : public ::testing::TestWithParam { return std::make_pair(cdf_hist, pdf_hist); } + auto InverseGaussianHalfDeviance( + std::vector const& hist) // 1/n * 2 * sum((y - y_pred) * (y - y_pred)/(y * (y_pred) * (y_pred))) + { + BinT aggregate{BinT()}; + aggregate = std::accumulate(hist.begin(), hist.end(), aggregate); + assert(aggregate.count > 0); + DataT const y_mean = aggregate.label_sum / aggregate.count; + auto ighd{DataT(0.0)}; // ighd: inverse gaussian half deviance + + std::for_each(hist.begin(), hist.end(), [&](BinT const& h) { + ighd += (h.label_sum - y_mean) * (h.label_sum - y_mean) / (h.label_sum * y_mean * y_mean); // unit deviance + }); + + ighd /= aggregate.count; + return std::make_tuple( + ighd, aggregate.label_sum, static_cast(aggregate.count)); + } + + auto InverseGaussianGroundTruthGain(std::vector const& pdf_hist, std::size_t split_bin_index) + { + std::vector 
left_pdf_hist{pdf_hist.begin(), pdf_hist.begin() + split_bin_index + 1}; + std::vector right_pdf_hist{pdf_hist.begin() + split_bin_index + 1, pdf_hist.end()}; + + auto [parent_ighd, label_sum, n] = InverseGaussianHalfDeviance(pdf_hist); + auto [left_ighd, label_sum_left, n_left] = InverseGaussianHalfDeviance(left_pdf_hist); + auto [right_ighd, label_sum_right, n_right] = InverseGaussianHalfDeviance(right_pdf_hist); + + + auto gain = parent_ighd - ((n_left / n) * left_ighd + // the minimizing objective function is half deviance + (n_right / n) * right_ighd); // gain in long form without proxy + + // edge cases + if (n_left < params.min_samples_leaf or n_right < params.min_samples_leaf or + label_sum < ObjectiveT::eps_ or label_sum_right < ObjectiveT::eps_ or + label_sum_left < ObjectiveT::eps_) + return -std::numeric_limits::max(); + else + return gain; + } + + auto GammaHalfDeviance( + std::vector const& hist) // 1/n * 2 * sum(log(y_pred/y_true) + y_true/y_pred - 1) + { + BinT aggregate{BinT()}; + aggregate = std::accumulate(hist.begin(), hist.end(), aggregate); + assert(aggregate.count > 0); + DataT const y_mean = aggregate.label_sum / aggregate.count; + auto mean_gamma_deviance{DataT(0.0)}; + + std::for_each(hist.begin(), hist.end(), [&](BinT const& h) { + auto log_y = raft::myLog(h.label_sum ? h.label_sum : DataT(1.0)); // we don't want nans + mean_gamma_deviance += h.count*raft::myLog(y_mean) - log_y + h.label_sum/y_mean - DataT(1); // InvGauss formula for each bin + }); + + mean_gamma_deviance /= aggregate.count; + // mean_gamma_deviance = raft::myLog(y_mean); + return std::make_tuple( + mean_gamma_deviance, aggregate.label_sum, static_cast(aggregate.count)); + } + + auto GammaGroundTruthGain(std::vector const& pdf_hist, std::size_t split_bin_index) + { + std::vector left_pdf_hist{pdf_hist.begin(), pdf_hist.begin() + split_bin_index + 1}; + std::vector right_pdf_hist{pdf_hist.begin() + split_bin_index + 1, pdf_hist.end()}; + + auto [parent_ghd, label_sum, n] = GammaHalfDeviance(pdf_hist); + auto [left_ghd, label_sum_left, n_left] = GammaHalfDeviance(left_pdf_hist); + auto [right_ghd, label_sum_right, n_right] = GammaHalfDeviance(right_pdf_hist); + + + auto gain = parent_ghd - ((n_left / n) * left_ghd + // the minimizing objective function is half deviance + (n_right / n) * right_ghd); // gain in long form without proxy + // DataT gain = n * parent_ghd - (n_left * left_ghd + n_right * right_ghd); + // gain = gain / n; + + // edge cases + if (n_left < params.min_samples_leaf or n_right < params.min_samples_leaf or + label_sum < ObjectiveT::eps_ or label_sum_right < ObjectiveT::eps_ or + label_sum_left < ObjectiveT::eps_) + return -std::numeric_limits::max(); + else + return gain; + } + + auto PoissonHalfDeviance( std::vector const& hist) // 1/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) { @@ -808,6 +886,14 @@ class ObjectiveTest : public ::testing::TestWithParam { PoissonObjectiveFunction>::value) // poisson { return PoissonGroundTruthGain(pdf_hist, split_bin_index); + } else if constexpr (std::is_same>::value) // gini + { + return GammaGroundTruthGain(pdf_hist, split_bin_index); + } else if constexpr (std::is_same>::value) // gini + { + return InverseGaussianGroundTruthGain(pdf_hist, split_bin_index); } else if constexpr (std::is_same>::value) // gini { @@ -858,6 +944,21 @@ const std::vector poisson_objective_test_parameters = { {9507819643927052251LLU, 256, 1, 1, 0.00001}, {9507819643927052258LLU, 512, 1, 5, 0.00001}, }; + +const std::vector gamma_objective_test_parameters = 
{ + {9507819643927052255LLU, 64, 1, 0, 0.00001}, + {9507819643927052259LLU, 128, 1, 1, 0.00001}, + {9507819643927052251LLU, 256, 1, 1, 0.00001}, + {9507819643927052258LLU, 512, 1, 5, 0.00001}, +}; + +const std::vector invgauss_objective_test_parameters = { + {9507819643927052255LLU, 64, 1, 0, 0.00001}, + {9507819643927052259LLU, 128, 1, 1, 0.00001}, + {9507819643927052251LLU, 256, 1, 1, 0.00001}, + {9507819643927052258LLU, 512, 1, 5, 0.00001}, +}; + const std::vector gini_objective_test_parameters = { {9507819643927052255LLU, 64, 2, 0, 0.00001}, {9507819643927052256LLU, 128, 10, 1, 0.00001}, @@ -871,6 +972,18 @@ TEST_P(PoissonObjectiveTestD, poissonObjectiveTest) {} INSTANTIATE_TEST_CASE_P(RfTests, PoissonObjectiveTestD, ::testing::ValuesIn(poisson_objective_test_parameters)); +// gamma objective test +typedef ObjectiveTest> GammaObjectiveTestD; +TEST_P(GammaObjectiveTestD, GammaObjectiveTest) {} +INSTANTIATE_TEST_CASE_P(RfTests, + GammaObjectiveTestD, + ::testing::ValuesIn(gamma_objective_test_parameters)); +// InvGauss objective test +typedef ObjectiveTest> InverseGaussianObjectiveTestD; +TEST_P(InverseGaussianObjectiveTestD, InverseGaussianObjectiveTest) {} +INSTANTIATE_TEST_CASE_P(RfTests, + InverseGaussianObjectiveTestD, + ::testing::ValuesIn(invgauss_objective_test_parameters)); // gini objective test typedef ObjectiveTest> GiniObjectiveTestD; From e63754a281cee2eec8c54a80b25f662b56d414a5 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Wed, 22 Sep 2021 22:30:33 +0530 Subject: [PATCH 27/42] refactor unit tests --- .../batched-levelalgo/metrics.cuh | 55 ++-- cpp/src/decisiontree/decisiontree.cuh | 40 +-- cpp/test/sg/rf_test.cu | 272 ++++++++++-------- 3 files changed, 188 insertions(+), 179 deletions(-) diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index b92459a081..f9372a9a4f 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -241,7 +241,6 @@ class EntropyObjectiveFunction { **/ template class TweedieObjectiveFunction { - public: using DataT = DataT_; using LabelT = LabelT_; @@ -252,21 +251,16 @@ class TweedieObjectiveFunction { IdxT min_samples_leaf; public: - HDI TweedieObjectiveFunction(IdxT min_samples_leaf) - : min_samples_leaf(min_samples_leaf) - { - } + HDI TweedieObjectiveFunction(IdxT min_samples_leaf) : min_samples_leaf(min_samples_leaf) {} DI IdxT NumClasses() const { return 1; } - static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out) { for (int i = 0; i < nclasses; i++) { out[i] = shist[i].label_sum / shist[i].count; } } - }; template @@ -304,7 +298,8 @@ class MSEObjectiveFunction : public TweedieObjectiveFunction Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const + DI Split Gain( + BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const { Split sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { @@ -313,7 +308,6 @@ class MSEObjectiveFunction : public TweedieObjectiveFunction @@ -370,7 +364,8 @@ class PoissonObjectiveFunction : public TweedieObjectiveFunction Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const + DI Split Gain( + BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const { Split sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { @@ -379,17 +374,15 @@ class PoissonObjectiveFunction : public TweedieObjectiveFunction class GammaObjectiveFunction : public TweedieObjectiveFunction { 
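  // (Gamma half-deviance split objective: like the Poisson objective, it
  //  assumes strictly positive labels and rejects any split whose parent or
  //  child label sum falls below eps_.)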
public: - using DataT = DataT_; - using LabelT = LabelT_; - using IdxT = IdxT_; - using BinT = AggregateBin; + using DataT = DataT_; + using LabelT = LabelT_; + using IdxT = IdxT_; + using BinT = AggregateBin; static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); HDI GammaObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) @@ -399,7 +392,6 @@ class GammaObjectiveFunction : public TweedieObjectiveFunction Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const + DI Split Gain( + BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const { Split sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { @@ -442,10 +432,10 @@ class GammaObjectiveFunction : public TweedieObjectiveFunction class InverseGaussianObjectiveFunction : public TweedieObjectiveFunction { public: - using DataT = DataT_; - using LabelT = LabelT_; - using IdxT = IdxT_; - using BinT = AggregateBin; + using DataT = DataT_; + using LabelT = LabelT_; + using IdxT = IdxT_; + using BinT = AggregateBin; static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); HDI InverseGaussianObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) @@ -471,16 +461,17 @@ class InverseGaussianObjectiveFunction : public TweedieObjectiveFunction::max(); // compute the gain to be - DataT parent_obj = DataT(len) * DataT(len) / label_sum; - DataT left_obj = DataT(nLeft) * DataT(nLeft) / left_label_sum; - DataT right_obj = DataT(nRight) * DataT(nRight) / right_label_sum; + DataT parent_obj = -DataT(len) * DataT(len) / label_sum; + DataT left_obj = -DataT(nLeft) * DataT(nLeft) / left_label_sum; + DataT right_obj = -DataT(nRight) * DataT(nRight) / right_label_sum; auto gain = parent_obj - (left_obj + right_obj); gain = gain / len; return gain; } - DI Split Gain(BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const + DI Split Gain( + BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const { Split sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { diff --git a/cpp/src/decisiontree/decisiontree.cuh b/cpp/src/decisiontree/decisiontree.cuh index 33aeac0d75..978839e975 100644 --- a/cpp/src/decisiontree/decisiontree.cuh +++ b/cpp/src/decisiontree/decisiontree.cuh @@ -305,29 +305,29 @@ class DecisionTree { .train(); } else if (params.split_criterion == CRITERION::GAMMA) { return Builder>(handle, - treeid, - seed, - params, - data, - labels, - nrows, - ncols, - rowids, - unique_labels, - quantiles) + treeid, + seed, + params, + data, + labels, + nrows, + ncols, + rowids, + unique_labels, + quantiles) .train(); } else if (params.split_criterion == CRITERION::INVERSE_GAUSSIAN) { return Builder>(handle, - treeid, - seed, - params, - data, - labels, - nrows, - ncols, - rowids, - unique_labels, - quantiles) + treeid, + seed, + params, + data, + labels, + nrows, + ncols, + rowids, + unique_labels, + quantiles) .train(); } else { ASSERT(false, "Unknown split criterion."); diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index 5472ffcd95..89afe08eae 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -13,8 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include #include +#include +#include #include #include @@ -284,7 +285,6 @@ class RfSpecialisedTest { std::tie(forest, predictions, training_metrics) = TrainScore(handle, params, X.data().get(), X_transpose.data().get(), y.data().get()); - Test(); } // Current model should be at least as accurate as a model with depth - 1 @@ -488,7 +488,9 @@ std::vector min_impurity_decrease = {0.0f, 1.0f, 10.0f}; std::vector n_streams = {1, 2, 10}; std::vector split_criterion = { // CRITERION::POISSON, - CRITERION::MSE, CRITERION::GINI, CRITERION::ENTROPY}; + CRITERION::MSE, + CRITERION::GINI, + CRITERION::ENTROPY}; std::vector seed = {0, 17}; std::vector n_labels = {2, 10, 20}; std::vector double_precision = {false, true}; @@ -668,6 +670,7 @@ namespace DT { struct ObjectiveTestParameters { uint64_t seed; + int n_rows; int n_bins; int n_classes; int min_samples_leaf; @@ -686,26 +689,52 @@ class ObjectiveTest : public ::testing::TestWithParam { public: auto RandUnder(int const end = 100000) { return rand() % end; } - auto GenSortedData() + auto GenRandomData() { - + std::default_random_engine rng; + std::vector data(params.n_rows); + if constexpr (std::is_same::value) // classification case + { + for (auto& iter : data) { + iter = RandUnder(params.n_classes); + } + } else { + std::normal_distribution normal(1.0, 2.0); + for (auto& iter : data) { + auto rand_element(DataT(0)); + while (1) { + rand_element = normal(rng); + if (rand_element > 0) break; // only positive random numbers + } + iter = rand_element; + } + } + return data; } - auto GenHist() + auto GenHist(std::vector data) { std::vector cdf_hist, pdf_hist; for (auto c = 0; c < params.n_classes; ++c) { for (auto b = 0; b < params.n_bins; ++b) { - if constexpr (std::is_same::value) - pdf_hist.emplace_back(RandUnder()); - else - pdf_hist.emplace_back(static_cast(RandUnder()), RandUnder()); + IdxT bin_width = raft::ceildiv(params.n_rows, params.n_bins); + auto data_begin = data.begin() + b * bin_width; + auto data_end = data_begin + bin_width; + if constexpr (std::is_same::value) { // classification case + auto count(IdxT(0)); + std::for_each(data_begin, data_end, [&](auto d) { + if (d == c) ++count; + }); + pdf_hist.emplace_back(count); + } else { // regression case + auto label_sum(DataT(0)); + label_sum = std::accumulate(data_begin, data_end, DataT(0)); + pdf_hist.emplace_back(label_sum, bin_width); + } auto cumulative = b > 0 ? cdf_hist.back() : BinT(); - cdf_hist.emplace_back(pdf_hist.empty() ? 
BinT() : pdf_hist.back()); - cdf_hist.back() += cumulative; } } @@ -714,35 +743,34 @@ class ObjectiveTest : public ::testing::TestWithParam { } auto InverseGaussianHalfDeviance( - std::vector const& hist) // 1/n * 2 * sum((y - y_pred) * (y - y_pred)/(y * (y_pred) * (y_pred))) + std::vector const& + data) // 1/n * 2 * sum((y - y_pred) * (y - y_pred)/(y * (y_pred) * (y_pred))) { - BinT aggregate{BinT()}; - aggregate = std::accumulate(hist.begin(), hist.end(), aggregate); - assert(aggregate.count > 0); - DataT const y_mean = aggregate.label_sum / aggregate.count; - auto ighd{DataT(0.0)}; // ighd: inverse gaussian half deviance - - std::for_each(hist.begin(), hist.end(), [&](BinT const& h) { - ighd += (h.label_sum - y_mean) * (h.label_sum - y_mean) / (h.label_sum * y_mean * y_mean); // unit deviance + DataT sum = std::accumulate(data.begin(), data.end(), DataT(0)); + DataT const mean = sum / data.size(); + auto ighd{DataT(0.0)}; // ighd: inverse gaussian half deviance + + std::for_each(data.begin(), data.end(), [&](auto d) { + ighd += (d - mean) * (d - mean) / (d * mean * mean); // unit deviance }); - ighd /= aggregate.count; - return std::make_tuple( - ighd, aggregate.label_sum, static_cast(aggregate.count)); + ighd /= data.size(); + return std::make_tuple(ighd, sum, DataT(data.size())); } - auto InverseGaussianGroundTruthGain(std::vector const& pdf_hist, std::size_t split_bin_index) + auto InverseGaussianGroundTruthGain(std::vector const& data, std::size_t split_bin_index) { - std::vector left_pdf_hist{pdf_hist.begin(), pdf_hist.begin() + split_bin_index + 1}; - std::vector right_pdf_hist{pdf_hist.begin() + split_bin_index + 1, pdf_hist.end()}; + auto bin_width = raft::ceildiv(params.n_rows, params.n_bins); + std::vector left_data(data.begin(), data.begin() + (split_bin_index + 1) * bin_width); + std::vector right_data(data.begin() + (split_bin_index + 1) * bin_width, data.end()); - auto [parent_ighd, label_sum, n] = InverseGaussianHalfDeviance(pdf_hist); - auto [left_ighd, label_sum_left, n_left] = InverseGaussianHalfDeviance(left_pdf_hist); - auto [right_ighd, label_sum_right, n_right] = InverseGaussianHalfDeviance(right_pdf_hist); + auto [parent_ighd, label_sum, n] = InverseGaussianHalfDeviance(data); + auto [left_ighd, label_sum_left, n_left] = InverseGaussianHalfDeviance(left_data); + auto [right_ighd, label_sum_right, n_right] = InverseGaussianHalfDeviance(right_data); - - auto gain = parent_ighd - ((n_left / n) * left_ighd + // the minimizing objective function is half deviance - (n_right / n) * right_ighd); // gain in long form without proxy + auto gain = parent_ighd - + ((n_left / n) * left_ighd + // the minimizing objective function is half deviance + (n_right / n) * right_ighd); // gain in long form without proxy // edge cases if (n_left < params.min_samples_leaf or n_right < params.min_samples_leaf or @@ -754,39 +782,35 @@ class ObjectiveTest : public ::testing::TestWithParam { } auto GammaHalfDeviance( - std::vector const& hist) // 1/n * 2 * sum(log(y_pred/y_true) + y_true/y_pred - 1) + std::vector const& data) // 1/n * 2 * sum(log(y_pred/y_true) + y_true/y_pred - 1) { - BinT aggregate{BinT()}; - aggregate = std::accumulate(hist.begin(), hist.end(), aggregate); - assert(aggregate.count > 0); - DataT const y_mean = aggregate.label_sum / aggregate.count; - auto mean_gamma_deviance{DataT(0.0)}; - - std::for_each(hist.begin(), hist.end(), [&](BinT const& h) { - auto log_y = raft::myLog(h.label_sum ? 
h.label_sum : DataT(1.0)); // we don't want nans - mean_gamma_deviance += h.count*raft::myLog(y_mean) - log_y + h.label_sum/y_mean - DataT(1); // InvGauss formula for each bin + DataT sum(0); + sum = std::accumulate(data.begin(), data.end(), DataT(0)); + DataT const mean = sum / data.size(); + DataT ghd(0); // gamma half deviance + + std::for_each(data.begin(), data.end(), [&](auto& element) { + auto log_y = raft::myLog(element ? element : DataT(1.0)); + ghd += raft::myLog(mean) - log_y + element / mean - 1; }); - mean_gamma_deviance /= aggregate.count; - // mean_gamma_deviance = raft::myLog(y_mean); - return std::make_tuple( - mean_gamma_deviance, aggregate.label_sum, static_cast(aggregate.count)); + ghd /= data.size(); + return std::make_tuple(ghd, sum, DataT(data.size())); } - auto GammaGroundTruthGain(std::vector const& pdf_hist, std::size_t split_bin_index) + auto GammaGroundTruthGain(std::vector const& data, std::size_t split_bin_index) { - std::vector left_pdf_hist{pdf_hist.begin(), pdf_hist.begin() + split_bin_index + 1}; - std::vector right_pdf_hist{pdf_hist.begin() + split_bin_index + 1, pdf_hist.end()}; - - auto [parent_ghd, label_sum, n] = GammaHalfDeviance(pdf_hist); - auto [left_ghd, label_sum_left, n_left] = GammaHalfDeviance(left_pdf_hist); - auto [right_ghd, label_sum_right, n_right] = GammaHalfDeviance(right_pdf_hist); + auto bin_width = raft::ceildiv(params.n_rows, params.n_bins); + std::vector left_data(data.begin(), data.begin() + (split_bin_index + 1) * bin_width); + std::vector right_data(data.begin() + (split_bin_index + 1) * bin_width, data.end()); + auto [parent_ghd, label_sum, n] = GammaHalfDeviance(data); + auto [left_ghd, label_sum_left, n_left] = GammaHalfDeviance(left_data); + auto [right_ghd, label_sum_right, n_right] = GammaHalfDeviance(right_data); - auto gain = parent_ghd - ((n_left / n) * left_ghd + // the minimizing objective function is half deviance - (n_right / n) * right_ghd); // gain in long form without proxy - // DataT gain = n * parent_ghd - (n_left * left_ghd + n_right * right_ghd); - // gain = gain / n; + auto gain = + parent_ghd - ((n_left / n) * left_ghd + // the minimizing objective function is half deviance + (n_right / n) * right_ghd); // gain in long form without proxy // edge cases if (n_left < params.min_samples_leaf or n_right < params.min_samples_leaf or @@ -797,34 +821,31 @@ class ObjectiveTest : public ::testing::TestWithParam { return gain; } - auto PoissonHalfDeviance( - std::vector const& hist) // 1/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) + std::vector const& data) // 1/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) { - BinT aggregate{BinT()}; - aggregate = std::accumulate(hist.begin(), hist.end(), aggregate); - assert(aggregate.count > 0); - auto const y_mean = aggregate.label_sum / aggregate.count; + DataT sum = std::accumulate(data.begin(), data.end(), DataT(0)); + auto const mean = sum / data.size(); auto poisson_half_deviance{DataT(0.0)}; - std::for_each(hist.begin(), hist.end(), [&](BinT const& h) { - auto log_y = raft::myLog(h.label_sum ? h.label_sum : DataT(1.0)); // we don't want nans - poisson_half_deviance += h.label_sum * (log_y - raft::myLog(y_mean)) + y_mean - h.label_sum; + std::for_each(data.begin(), data.end(), [&](auto d) { + auto log_y = raft::myLog(d ? 
d : DataT(1.0)); // we don't want nans + poisson_half_deviance += d * (log_y - raft::myLog(mean)) + mean - d; }); - poisson_half_deviance /= aggregate.count; - return std::make_tuple( - poisson_half_deviance, aggregate.label_sum, static_cast(aggregate.count)); + poisson_half_deviance /= data.size(); + return std::make_tuple(poisson_half_deviance, sum, DataT(data.size())); } - auto PoissonGroundTruthGain(std::vector const& pdf_hist, std::size_t split_bin_index) + auto PoissonGroundTruthGain(std::vector const& data, std::size_t split_bin_index) { - std::vector left_pdf_hist{pdf_hist.begin(), pdf_hist.begin() + split_bin_index + 1}; - std::vector right_pdf_hist{pdf_hist.begin() + split_bin_index + 1, pdf_hist.end()}; + auto bin_width = raft::ceildiv(params.n_rows, params.n_bins); + std::vector left_data(data.begin(), data.begin() + (split_bin_index + 1) * bin_width); + std::vector right_data(data.begin() + (split_bin_index + 1) * bin_width, data.end()); - auto [parent_phd, label_sum, n] = PoissonHalfDeviance(pdf_hist); - auto [left_phd, label_sum_left, n_left] = PoissonHalfDeviance(left_pdf_hist); - auto [right_phd, label_sum_right, n_right] = PoissonHalfDeviance(right_pdf_hist); + auto [parent_phd, label_sum, n] = PoissonHalfDeviance(data); + auto [left_phd, label_sum_left, n_left] = PoissonHalfDeviance(left_data); + auto [right_phd, label_sum_right, n_right] = PoissonHalfDeviance(right_data); auto gain = parent_phd - ((n_left / n) * left_phd + (n_right / n) * right_phd); // gain in long form without proxy @@ -838,37 +859,32 @@ class ObjectiveTest : public ::testing::TestWithParam { return gain; } - auto GiniImpurity(std::vector const& hist) + auto GiniImpurity(std::vector const& data) { // sum((n_c/n_total)(1-(n_c/n_total))) - auto gini{double(0)}; - auto n_bins = hist.size() / params.n_classes; - auto n_instances = std::accumulate(hist.begin(), hist.end(), BinT()).x; // total instances + double gini(0); for (auto c = 0; c < params.n_classes; ++c) { - auto begin_iter = hist.begin() + c * n_bins; - auto end_iter = hist.begin() + (c + 1) * n_bins; - double class_proba = std::accumulate(begin_iter, end_iter, BinT()).x; // instances of class c - class_proba /= n_instances; // probability of class c + IdxT sum(0); + std::for_each(data.begin(), data.end(), [&](auto d) { + if (d == DataT(c)) ++sum; + }); + double class_proba = double(sum) / data.size(); gini += class_proba * (1 - class_proba); // adding gain } - return std::make_pair(gini, double(n_instances)); + return gini; } - auto GiniGroundTruthGain(std::vector const& pdf_hist, std::size_t const split_bin_index) + auto GiniGroundTruthGain(std::vector const& data, std::size_t const split_bin_index) { - std::vector left_pdf_hist, right_pdf_hist; + auto bin_width = raft::ceildiv(params.n_rows, params.n_bins); + std::vector left_data(data.begin(), data.begin() + (split_bin_index + 1) * bin_width); + std::vector right_data(data.begin() + (split_bin_index + 1) * bin_width, data.end()); - for (auto c = 0; c < params.n_classes; ++c) { // decompose the pdf_hist - auto start = pdf_hist.begin() + c * params.n_bins; - auto split = pdf_hist.begin() + c * params.n_bins + split_bin_index + 1; - auto end = pdf_hist.begin() + (c + 1) * params.n_bins; - - left_pdf_hist.insert(left_pdf_hist.end(), start, split); - right_pdf_hist.insert(right_pdf_hist.end(), split, end); - } - - auto [parent_gini, n] = GiniImpurity(pdf_hist); - auto [left_gini, left_n] = GiniImpurity(left_pdf_hist); - auto [right_gini, right_n] = GiniImpurity(right_pdf_hist); + auto parent_gini 
= GiniImpurity(data); + auto left_gini = GiniImpurity(left_data); + auto right_gini = GiniImpurity(right_data); + double n = data.size(); + double left_n = left_data.size(); + double right_n = right_data.size(); auto gain = parent_gini - ((left_n / n) * left_gini + (right_n / n) * right_gini); @@ -880,24 +896,25 @@ class ObjectiveTest : public ::testing::TestWithParam { } } - auto GroundTruthGain(std::vector const& pdf_hist, std::size_t const split_bin_index) + auto GroundTruthGain(std::vector const& data, std::size_t const split_bin_index) { if constexpr (std::is_same>::value) // poisson { - return PoissonGroundTruthGain(pdf_hist, split_bin_index); + return PoissonGroundTruthGain(data, split_bin_index); } else if constexpr (std::is_same>::value) // gini { - return GammaGroundTruthGain(pdf_hist, split_bin_index); - } else if constexpr (std::is_same>::value) // gini + return GammaGroundTruthGain(data, split_bin_index); + } else if constexpr (std::is_same< + ObjectiveT, + InverseGaussianObjectiveFunction>::value) // gini { - return InverseGaussianGroundTruthGain(pdf_hist, split_bin_index); + return InverseGaussianGroundTruthGain(data, split_bin_index); } else if constexpr (std::is_same>::value) // gini { - return GiniGroundTruthGain(pdf_hist, split_bin_index); + return GiniGroundTruthGain(data, split_bin_index); } return double(0.0); } @@ -923,10 +940,10 @@ class ObjectiveTest : public ::testing::TestWithParam { params = ::testing::TestWithParam::GetParam(); ObjectiveT objective(params.n_classes, params.min_samples_leaf); - auto [cdf_hist, pdf_hist] = GenHist(); - - auto split_bin_index = RandUnder(params.n_bins); - auto ground_truth_gain = GroundTruthGain(pdf_hist, split_bin_index); + auto data = GenRandomData(); + auto [cdf_hist, pdf_hist] = GenHist(data); + auto split_bin_index = RandUnder(params.n_bins); + auto ground_truth_gain = GroundTruthGain(data, split_bin_index); auto hypothesis_gain = objective.GainPerSplit(&cdf_hist[0], split_bin_index, @@ -939,31 +956,31 @@ class ObjectiveTest : public ::testing::TestWithParam { }; const std::vector poisson_objective_test_parameters = { - {9507819643927052255LLU, 64, 1, 0, 0.00001}, - {9507819643927052259LLU, 128, 1, 1, 0.00001}, - {9507819643927052251LLU, 256, 1, 1, 0.00001}, - {9507819643927052258LLU, 512, 1, 5, 0.00001}, + {9507819643927052255LLU, 2048, 64, 1, 0, 0.00001}, + {9507819643927052259LLU, 2048, 128, 1, 1, 0.00001}, + {9507819643927052251LLU, 2048, 256, 1, 1, 0.00001}, + {9507819643927052258LLU, 2048, 512, 1, 5, 0.00001}, }; const std::vector gamma_objective_test_parameters = { - {9507819643927052255LLU, 64, 1, 0, 0.00001}, - {9507819643927052259LLU, 128, 1, 1, 0.00001}, - {9507819643927052251LLU, 256, 1, 1, 0.00001}, - {9507819643927052258LLU, 512, 1, 5, 0.00001}, + {9507819643927052255LLU, 2048, 64, 1, 0, 0.00001}, + {9507819643927052259LLU, 2048, 128, 1, 1, 0.00001}, + {9507819643927052251LLU, 2048, 256, 1, 1, 0.00001}, + {9507819643927052258LLU, 2048, 512, 1, 5, 0.00001}, }; const std::vector invgauss_objective_test_parameters = { - {9507819643927052255LLU, 64, 1, 0, 0.00001}, - {9507819643927052259LLU, 128, 1, 1, 0.00001}, - {9507819643927052251LLU, 256, 1, 1, 0.00001}, - {9507819643927052258LLU, 512, 1, 5, 0.00001}, + {9507819643927052255LLU, 2048, 64, 1, 0, 0.00001}, + {9507819643927052259LLU, 2048, 128, 1, 1, 0.00001}, + {9507819643927052251LLU, 2048, 256, 1, 1, 0.00001}, + {9507819643927052258LLU, 2048, 512, 1, 5, 0.00001}, }; const std::vector gini_objective_test_parameters = { - {9507819643927052255LLU, 64, 2, 0, 
0.00001}, - {9507819643927052256LLU, 128, 10, 1, 0.00001}, - {9507819643927052257LLU, 256, 100, 1, 0.00001}, - {9507819643927052258LLU, 512, 100, 5, 0.00001}, + {9507819643927052255LLU, 2048, 64, 2, 0, 0.00001}, + {9507819643927052256LLU, 2048, 128, 10, 1, 0.00001}, + {9507819643927052257LLU, 2048, 256, 100, 1, 0.00001}, + {9507819643927052258LLU, 2048, 512, 100, 5, 0.00001}, }; // poisson objective test @@ -979,7 +996,8 @@ INSTANTIATE_TEST_CASE_P(RfTests, GammaObjectiveTestD, ::testing::ValuesIn(gamma_objective_test_parameters)); // InvGauss objective test -typedef ObjectiveTest> InverseGaussianObjectiveTestD; +typedef ObjectiveTest> + InverseGaussianObjectiveTestD; TEST_P(InverseGaussianObjectiveTestD, InverseGaussianObjectiveTest) {} INSTANTIATE_TEST_CASE_P(RfTests, InverseGaussianObjectiveTestD, From 78b0ffde15475c0e30c381170358b000ac46233a Mon Sep 17 00:00:00 2001 From: venkywonka Date: Fri, 24 Sep 2021 09:16:02 +0530 Subject: [PATCH 28/42] add tests for entropy and mse --- .../batched-levelalgo/metrics.cuh | 15 +- cpp/test/sg/rf_test.cu | 252 +++++++++--------- .../dask/ensemble/randomforestclassifier.py | 13 +- .../dask/ensemble/randomforestregressor.py | 2 + python/cuml/ensemble/randomforest_common.pyx | 5 +- python/cuml/ensemble/randomforest_shared.pxd | 2 + .../cuml/ensemble/randomforestregressor.pyx | 2 + 7 files changed, 142 insertions(+), 149 deletions(-) diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index dfa11a3ebf..974eaa3206 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -100,7 +100,7 @@ class GiniObjectiveFunction { HDI DataT GainPerSplit(BinT* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) { - auto nRight = len - nLeft; + IdxT nRight = len - nLeft; constexpr DataT One = DataT(1.0); auto invlen = One / len; auto invLeft = One / nLeft; @@ -175,7 +175,7 @@ class EntropyObjectiveFunction { HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) { - auto nRight{len - nLeft}; + IdxT nRight{len - nLeft}; auto gain{DataT(0.0)}; // if there aren't enough samples in this split, don't bother! 
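    // (For reference: the quantity assembled below is the information gain
    //  IG = H(parent) - (nLeft/len) * H(left) - (nRight/len) * H(right),
    //  with class probabilities taken from the cumulative bin histogram.)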
if (nLeft < min_samples_leaf || nRight < min_samples_leaf) { @@ -280,7 +280,7 @@ class MSEObjectiveFunction : public TweedieObjectiveFunctionmin_samples_leaf || nRight < this->min_samples_leaf) { @@ -292,7 +292,7 @@ class MSEObjectiveFunction : public TweedieObjectiveFunctionmin_samples_leaf || nRight < this->min_samples_leaf) @@ -392,6 +392,7 @@ class GammaObjectiveFunction : public TweedieObjectiveFunctionmin_samples_leaf || nRight < this->min_samples_leaf) return -std::numeric_limits::max(); @@ -443,7 +444,7 @@ class InverseGaussianObjectiveFunction : public TweedieObjectiveFunctionmin_samples_leaf || nRight < this->min_samples_leaf) @@ -462,7 +463,7 @@ class InverseGaussianObjectiveFunction : public TweedieObjectiveFunction { void SetUp() override { RfTestParams params = ::testing::TestWithParam::GetParam(); - bool is_regression = params.split_criterion == MSE or params.split_criterion == MAE or - params.split_criterion == POISSON; + bool is_regression = params.split_criterion != GINI and params.split_criterion != ENTROPY; if (params.double_precision) { if (is_regression) { RfSpecialisedTest test(params); @@ -487,6 +485,8 @@ std::vector min_samples_split = {2, 10}; std::vector min_impurity_decrease = {0.0f, 1.0f, 10.0f}; std::vector n_streams = {1, 2, 10}; std::vector split_criterion = { + CRITERION::INVERSE_GAUSSIAN, + CRITERION::GAMMA, CRITERION::POISSON, CRITERION::MSE, CRITERION::GINI, @@ -670,10 +670,7 @@ namespace DT { struct ObjectiveTestParameters { uint64_t seed; -<<<<<<< HEAD int n_rows; -======= ->>>>>>> d0aaafc51703cbe7efca995f495f7ab9731c9dd0 int n_bins; int n_classes; int min_samples_leaf; @@ -690,8 +687,7 @@ class ObjectiveTest : public ::testing::TestWithParam { ObjectiveTestParameters params; public: -<<<<<<< HEAD - auto RandUnder(int const end = 100000) { return rand() % end; } + auto RandUnder(int const end = 10000) { return rand() % end; } auto GenRandomData() { @@ -699,41 +695,35 @@ class ObjectiveTest : public ::testing::TestWithParam { std::vector data(params.n_rows); if constexpr (std::is_same::value) // classification case { - for (auto& iter : data) { - iter = RandUnder(params.n_classes); + for (auto& d : data) { + d = RandUnder(params.n_classes); } } else { std::normal_distribution normal(1.0, 2.0); - for (auto& iter : data) { + for (auto& d : data) { auto rand_element(DataT(0)); while (1) { rand_element = normal(rng); if (rand_element > 0) break; // only positive random numbers } - iter = rand_element; + d = rand_element; } } return data; } auto GenHist(std::vector data) -======= - auto RandUnder(int const end = 10000) { return rand() % end; } - - auto GenHist() ->>>>>>> d0aaafc51703cbe7efca995f495f7ab9731c9dd0 { std::vector cdf_hist, pdf_hist; for (auto c = 0; c < params.n_classes; ++c) { for (auto b = 0; b < params.n_bins; ++b) { -<<<<<<< HEAD IdxT bin_width = raft::ceildiv(params.n_rows, params.n_bins); auto data_begin = data.begin() + b * bin_width; auto data_end = data_begin + bin_width; if constexpr (std::is_same::value) { // classification case auto count(IdxT(0)); - std::for_each(data_begin, data_end, [&](auto d) { + std::for_each(data_begin, data_end, [&](auto d){ if (d == c) ++count; }); pdf_hist.emplace_back(count); @@ -745,17 +735,6 @@ class ObjectiveTest : public ::testing::TestWithParam { auto cumulative = b > 0 ? cdf_hist.back() : BinT(); cdf_hist.emplace_back(pdf_hist.empty() ? 
BinT() : pdf_hist.back()); -======= - if constexpr (std::is_same::value) - pdf_hist.emplace_back(RandUnder()); - else - pdf_hist.emplace_back(static_cast(RandUnder()), RandUnder()); - - auto cumulative = b > 0 ? cdf_hist.back() : BinT(); - - cdf_hist.emplace_back(pdf_hist.empty() ? BinT() : pdf_hist.back()); - ->>>>>>> d0aaafc51703cbe7efca995f495f7ab9731c9dd0 cdf_hist.back() += cumulative; } } @@ -763,7 +742,44 @@ class ObjectiveTest : public ::testing::TestWithParam { return std::make_pair(cdf_hist, pdf_hist); } -<<<<<<< HEAD + + auto MSE( + std::vector const& + data) // 1/n * 1/2 * sum((y - y_pred) * (y - y_pred)) + { + DataT sum = std::accumulate(data.begin(), data.end(), DataT(0)); + DataT const mean = sum / data.size(); + auto mse{DataT(0.0)}; // mse: mean squared error + + std::for_each(data.begin(), data.end(), [&](auto d) { + mse += (d - mean) * (d - mean); // unit deviance + }); + + mse /= 2 * data.size(); + return std::make_tuple(mse, sum, DataT(data.size())); + } + + auto MSEGroundTruthGain(std::vector const& data, std::size_t split_bin_index) + { + auto bin_width = raft::ceildiv(params.n_rows, params.n_bins); + std::vector left_data(data.begin(), data.begin() + (split_bin_index + 1) * bin_width); + std::vector right_data(data.begin() + (split_bin_index + 1) * bin_width, data.end()); + + auto [parent_mse, label_sum, n] = MSE(data); + auto [left_mse, label_sum_left, n_left] = MSE(left_data); + auto [right_mse, label_sum_right, n_right] = MSE(right_data); + + auto gain = parent_mse - + ((n_left / n) * left_mse + // the minimizing objective function is half deviance + (n_right / n) * right_mse); // gain in long form without proxy + + // edge cases + if (n_left < params.min_samples_leaf or n_right < params.min_samples_leaf) + return -std::numeric_limits::max(); + else + return gain; + } + auto InverseGaussianHalfDeviance( std::vector const& data) // 1/n * 2 * sum((y - y_pred) * (y - y_pred)/(y * (y_pred) * (y_pred))) @@ -776,7 +792,7 @@ class ObjectiveTest : public ::testing::TestWithParam { ighd += (d - mean) * (d - mean) / (d * mean * mean); // unit deviance }); - ighd /= data.size(); + ighd /= 2 * data.size(); return std::make_tuple(ighd, sum, DataT(data.size())); } @@ -868,35 +884,6 @@ class ObjectiveTest : public ::testing::TestWithParam { auto [parent_phd, label_sum, n] = PoissonHalfDeviance(data); auto [left_phd, label_sum_left, n_left] = PoissonHalfDeviance(left_data); auto [right_phd, label_sum_right, n_right] = PoissonHalfDeviance(right_data); -======= - auto PoissonHalfDeviance( - std::vector const& hist) // 1/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) - { - BinT aggregate{BinT()}; - aggregate = std::accumulate(hist.begin(), hist.end(), aggregate); - assert(aggregate.count > 0); - auto const y_mean = aggregate.label_sum / aggregate.count; - auto poisson_half_deviance{DataT(0.0)}; - - std::for_each(hist.begin(), hist.end(), [&](BinT const& h) { - auto log_y = raft::myLog(h.label_sum ? 
h.label_sum : DataT(1.0)); // we don't want nans - poisson_half_deviance += h.label_sum * (log_y - raft::myLog(y_mean)) + y_mean - h.label_sum; - }); - - poisson_half_deviance /= aggregate.count; - return std::make_tuple( - poisson_half_deviance, aggregate.label_sum, static_cast(aggregate.count)); - } - - auto PoissonGroundTruthGain(std::vector const& pdf_hist, std::size_t split_bin_index) - { - std::vector left_pdf_hist{pdf_hist.begin(), pdf_hist.begin() + split_bin_index + 1}; - std::vector right_pdf_hist{pdf_hist.begin() + split_bin_index + 1, pdf_hist.end()}; - - auto [parent_phd, label_sum, n] = PoissonHalfDeviance(pdf_hist); - auto [left_phd, label_sum_left, n_left] = PoissonHalfDeviance(left_pdf_hist); - auto [right_phd, label_sum_right, n_right] = PoissonHalfDeviance(right_pdf_hist); ->>>>>>> d0aaafc51703cbe7efca995f495f7ab9731c9dd0 auto gain = parent_phd - ((n_left / n) * left_phd + (n_right / n) * right_phd); // gain in long form without proxy @@ -910,7 +897,43 @@ class ObjectiveTest : public ::testing::TestWithParam { return gain; } -<<<<<<< HEAD + auto Entropy(std::vector const& data) + { // sum((n_c/n_total)*(log(n_c/n_total))) + double entropy(0); + for (auto c = 0; c < params.n_classes; ++c) { + IdxT sum(0); + std::for_each(data.begin(), data.end(), [&](auto d) { + if (d == DataT(c)) ++sum; + }); + double class_proba = double(sum) / data.size(); + entropy += -class_proba * raft::myLog(class_proba ? class_proba : DataT(1)) / raft::myLog(DataT(2)); // adding gain + } + return entropy; + } + + auto EntropyGroundTruthGain(std::vector const& data, std::size_t const split_bin_index) + { + auto bin_width = raft::ceildiv(params.n_rows, params.n_bins); + std::vector left_data(data.begin(), data.begin() + (split_bin_index + 1) * bin_width); + std::vector right_data(data.begin() + (split_bin_index + 1) * bin_width, data.end()); + + auto parent_entropy = Entropy(data); + auto left_entropy = Entropy(left_data); + auto right_entropy = Entropy(right_data); + double n = data.size(); + double left_n = left_data.size(); + double right_n = right_data.size(); + + auto gain = parent_entropy - ((left_n / n) * left_entropy + (right_n / n) * right_entropy); + + // edge cases + if (left_n < params.min_samples_leaf or right_n < params.min_samples_leaf) { + return -std::numeric_limits::max(); + } else { + return gain; + } + } + auto GiniImpurity(std::vector const& data) { // sum((n_c/n_total)(1-(n_c/n_total))) double gini(0); @@ -937,39 +960,6 @@ class ObjectiveTest : public ::testing::TestWithParam { double n = data.size(); double left_n = left_data.size(); double right_n = right_data.size(); -======= - auto GiniImpurity(std::vector const& hist) - { // sum((n_c/n_total)(1-(n_c/n_total))) - auto gini{double(0)}; - auto n_bins = hist.size() / params.n_classes; - auto n_instances = std::accumulate(hist.begin(), hist.end(), BinT()).x; // total instances - for (auto c = 0; c < params.n_classes; ++c) { - auto begin_iter = hist.begin() + c * n_bins; - auto end_iter = hist.begin() + (c + 1) * n_bins; - double class_proba = std::accumulate(begin_iter, end_iter, BinT()).x; // instances of class c - class_proba /= n_instances; // probability of class c - gini += class_proba * (1 - class_proba); // adding gain - } - return std::make_pair(gini, double(n_instances)); - } - - auto GiniGroundTruthGain(std::vector const& pdf_hist, std::size_t const split_bin_index) - { - std::vector left_pdf_hist, right_pdf_hist; - - for (auto c = 0; c < params.n_classes; ++c) { // decompose the pdf_hist - auto start = 
pdf_hist.begin() + c * params.n_bins; - auto split = pdf_hist.begin() + c * params.n_bins + split_bin_index + 1; - auto end = pdf_hist.begin() + (c + 1) * params.n_bins; - - left_pdf_hist.insert(left_pdf_hist.end(), start, split); - right_pdf_hist.insert(right_pdf_hist.end(), split, end); - } - - auto [parent_gini, n] = GiniImpurity(pdf_hist); - auto [left_gini, left_n] = GiniImpurity(left_pdf_hist); - auto [right_gini, right_n] = GiniImpurity(right_pdf_hist); ->>>>>>> d0aaafc51703cbe7efca995f495f7ab9731c9dd0 auto gain = parent_gini - ((left_n / n) * left_gini + (right_n / n) * right_gini); @@ -981,37 +971,33 @@ class ObjectiveTest : public ::testing::TestWithParam { } } -<<<<<<< HEAD auto GroundTruthGain(std::vector const& data, std::size_t const split_bin_index) -======= - auto GroundTruthGain(std::vector const& pdf_hist, std::size_t const split_bin_index) ->>>>>>> d0aaafc51703cbe7efca995f495f7ab9731c9dd0 { if constexpr (std::is_same>::value) // mean squared error + { + return MSEGroundTruthGain(data, split_bin_index); + } else if constexpr (std::is_same>::value) // poisson { -<<<<<<< HEAD return PoissonGroundTruthGain(data, split_bin_index); } else if constexpr (std::is_same>::value) // gini + GammaObjectiveFunction>::value) // gamma { return GammaGroundTruthGain(data, split_bin_index); } else if constexpr (std::is_same< ObjectiveT, - InverseGaussianObjectiveFunction>::value) // gini + InverseGaussianObjectiveFunction>::value) // inverse gaussian { return InverseGaussianGroundTruthGain(data, split_bin_index); } else if constexpr (std::is_same>::value) // gini + EntropyObjectiveFunction>::value) // entropy { - return GiniGroundTruthGain(data, split_bin_index); -======= - return PoissonGroundTruthGain(pdf_hist, split_bin_index); + return EntropyGroundTruthGain(data, split_bin_index); } else if constexpr (std::is_same>::value) // gini { - return GiniGroundTruthGain(pdf_hist, split_bin_index); ->>>>>>> d0aaafc51703cbe7efca995f495f7ab9731c9dd0 + return GiniGroundTruthGain(data, split_bin_index); } return double(0.0); } @@ -1037,17 +1023,10 @@ class ObjectiveTest : public ::testing::TestWithParam { params = ::testing::TestWithParam::GetParam(); ObjectiveT objective(params.n_classes, params.min_samples_leaf); -<<<<<<< HEAD auto data = GenRandomData(); auto [cdf_hist, pdf_hist] = GenHist(data); auto split_bin_index = RandUnder(params.n_bins); auto ground_truth_gain = GroundTruthGain(data, split_bin_index); -======= - auto [cdf_hist, pdf_hist] = GenHist(); - - auto split_bin_index = RandUnder(params.n_bins); - auto ground_truth_gain = GroundTruthGain(pdf_hist, split_bin_index); ->>>>>>> d0aaafc51703cbe7efca995f495f7ab9731c9dd0 auto hypothesis_gain = objective.GainPerSplit(&cdf_hist[0], split_bin_index, @@ -1059,8 +1038,14 @@ class ObjectiveTest : public ::testing::TestWithParam { } }; +const std::vector mse_objective_test_parameters = { + {9507819643927052255LLU, 2048, 64, 1, 0, 0.00001}, + {9507819643927052259LLU, 2048, 128, 1, 1, 0.00001}, + {9507819643927052251LLU, 2048, 256, 1, 1, 0.00001}, + {9507819643927052258LLU, 2048, 512, 1, 5, 0.00001}, +}; + const std::vector poisson_objective_test_parameters = { -<<<<<<< HEAD {9507819643927052255LLU, 2048, 64, 1, 0, 0.00001}, {9507819643927052259LLU, 2048, 128, 1, 1, 0.00001}, {9507819643927052251LLU, 2048, 256, 1, 1, 0.00001}, @@ -1081,32 +1066,32 @@ const std::vector invgauss_objective_test_parameters = {9507819643927052258LLU, 2048, 512, 1, 5, 0.00001}, }; -const std::vector gini_objective_test_parameters = { +const std::vector 
entropy_objective_test_parameters = { {9507819643927052255LLU, 2048, 64, 2, 0, 0.00001}, {9507819643927052256LLU, 2048, 128, 10, 1, 0.00001}, {9507819643927052257LLU, 2048, 256, 100, 1, 0.00001}, {9507819643927052258LLU, 2048, 512, 100, 5, 0.00001}, -======= - {9507819643927052255LLU, 64, 1, 0, 0.00001}, - {9507819643927052259LLU, 128, 1, 1, 0.00001}, - {9507819643927052251LLU, 256, 1, 1, 0.00001}, - {9507819643927052258LLU, 512, 1, 5, 0.00001}, }; + const std::vector gini_objective_test_parameters = { - {9507819643927052255LLU, 64, 2, 0, 0.00001}, - {9507819643927052256LLU, 128, 10, 1, 0.00001}, - {9507819643927052257LLU, 256, 100, 1, 0.00001}, - {9507819643927052258LLU, 512, 100, 5, 0.00001}, ->>>>>>> d0aaafc51703cbe7efca995f495f7ab9731c9dd0 + {9507819643927052255LLU, 2048, 64, 2, 0, 0.00001}, + {9507819643927052256LLU, 2048, 128, 10, 1, 0.00001}, + {9507819643927052257LLU, 2048, 256, 100, 1, 0.00001}, + {9507819643927052258LLU, 2048, 512, 100, 5, 0.00001}, }; +// mse objective test +typedef ObjectiveTest> MSEObjectiveTestD; +TEST_P(MSEObjectiveTestD, MSEObjectiveTest) {} +INSTANTIATE_TEST_CASE_P(RfTests, + MSEObjectiveTestD, + ::testing::ValuesIn(mse_objective_test_parameters)); // poisson objective test typedef ObjectiveTest> PoissonObjectiveTestD; TEST_P(PoissonObjectiveTestD, poissonObjectiveTest) {} INSTANTIATE_TEST_CASE_P(RfTests, PoissonObjectiveTestD, ::testing::ValuesIn(poisson_objective_test_parameters)); -<<<<<<< HEAD // gamma objective test typedef ObjectiveTest> GammaObjectiveTestD; TEST_P(GammaObjectiveTestD, GammaObjectiveTest) {} @@ -1120,8 +1105,13 @@ TEST_P(InverseGaussianObjectiveTestD, InverseGaussianObjectiveTest) {} INSTANTIATE_TEST_CASE_P(RfTests, InverseGaussianObjectiveTestD, ::testing::ValuesIn(invgauss_objective_test_parameters)); -======= ->>>>>>> d0aaafc51703cbe7efca995f495f7ab9731c9dd0 + +// entropy objective test +typedef ObjectiveTest> EntropyObjectiveTestD; +TEST_P(EntropyObjectiveTestD, entropyObjectiveTest) {} +INSTANTIATE_TEST_CASE_P(RfTests, + EntropyObjectiveTestD, + ::testing::ValuesIn(entropy_objective_test_parameters)); // gini objective test typedef ObjectiveTest> GiniObjectiveTestD; diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py index ff33880e81..c867f63841 100755 --- a/python/cuml/dask/ensemble/randomforestclassifier.py +++ b/python/cuml/dask/ensemble/randomforestclassifier.py @@ -74,21 +74,14 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin, run different models concurrently in different streams by creating handles in several streams. If it is None, a new one is created. -<<<<<<< HEAD - split_criterion : int (default = 0) - The criterion used to split nodes. - 0 for GINI, 1 for ENTROPY, 5 for CRITERION_END. - 2 and 3 not valid for classification - split_algo : 0 for HIST and 1 for GLOBAL_QUANTILE (default = 1) - the algorithm to determine how nodes are split in the tree. -======= split_criterion : int or string (default = 0 ('gini')) The criterion used to split nodes. 0 or 'gini' for GINI, 1 or 'entropy' for ENTROPY, 2 or 'mse' for MSE, 4 or 'poisson' for POISSON, - 2, 'mse', 4, 'poisson' not valid for classification ->>>>>>> d0aaafc51703cbe7efca995f495f7ab9731c9dd0 + 5 or 'gamma' for GAMMA, + 6 or 'inverse_gaussian' for INVERSE_GAUSSIAN, + 2, 'mse', 4, 'poisson', 5, 'gamma', 6, 'inverse_gaussian' not valid for classification bootstrap : boolean (default = True) Control bootstrapping. 
If set, each tree in the forest is built diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py index 846e1cc344..328484d6ae 100755 --- a/python/cuml/dask/ensemble/randomforestregressor.py +++ b/python/cuml/dask/ensemble/randomforestregressor.py @@ -72,6 +72,8 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin, 0 or 'gini' for GINI, 1 or 'entropy' for ENTROPY, 2 or 'mse' for MSE, 4 or 'poisson' for POISSON, + 5 or 'gamma' for GAMMA, + 6 or 'inverse_gaussian' for INVERSE_GAUSSIAN, 0, 'gini', 1, 'entropy' not valid for regression bootstrap : boolean (default = True) Control bootstrapping. diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 264aafa084..ad43dc07b6 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -58,7 +58,10 @@ class BaseRandomForestModel(Base): '2': MSE, 'mse': MSE, '3': MAE, 'mae': MAE, '4': POISSON, 'poisson': POISSON, - '5': CRITERION_END} + '5': GAMMA, 'gamma': GAMMA, + '6': INVERSE_GAUSSIAN, + 'inverse_gaussian': INVERSE_GAUSSIAN, + '6': CRITERION_END} classes_ = CumlArrayDescriptor() diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd index 389eec5a45..638b1d7a10 100644 --- a/python/cuml/ensemble/randomforest_shared.pxd +++ b/python/cuml/ensemble/randomforest_shared.pxd @@ -43,6 +43,8 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": MSE, MAE, POISSON, + GAMMA, + INVERSE_GAUSSIAN, CRITERION_END cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index fdb4c9f369..0ede37da66 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -166,6 +166,8 @@ class RandomForestRegressor(BaseRandomForestModel, 0 or 'gini' for GINI, 1 or 'entropy' for ENTROPY, 2 or 'mse' for MSE, 4 or 'poisson' for POISSON, + 5 or 'gamma' for GAMMA, + 6 or 'inverse_gaussian' for INVERSE_GAUSSIAN, 0, 'gini', 1, 'entropy' not valid for regression. bootstrap : boolean (default = True) Control bootstrapping. 
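Taken together with the docstring updates above, selecting one of the new criteria is a one-line change for callers. The following sketch is not part of the patch series; it assumes a cuML build that already contains these commits, and the dataset and hyperparameters are purely illustrative:

.. code-block:: python

    # Illustrative only: 'gamma'/5 and 'inverse_gaussian'/6 require strictly
    # positive labels; 'poisson'/4 requires non-negative labels with a
    # positive sum.
    import numpy as np
    from cuml.ensemble import RandomForestRegressor as curfr

    X = np.random.random((1000, 4)).astype(np.float32)
    y = np.random.gamma(shape=1.0, size=1000).astype(np.float32)

    model = curfr(split_criterion='gamma',  # equivalently: split_criterion=5
                  n_estimators=10, max_depth=8)
    preds = model.fit(X, y).predict(X)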
From 11b2f4e375db5ade05bd7276d7a1b047aa57969c Mon Sep 17 00:00:00 2001 From: venkywonka Date: Mon, 4 Oct 2021 18:08:06 +0530 Subject: [PATCH 29/42] add python tests and refactor objectives --- .../batched-levelalgo/metrics.cuh | 140 +++++++++++------- cpp/test/sg/rf_test.cu | 57 +++++-- python/cuml/test/test_random_forest.py | 43 ++++-- 3 files changed, 161 insertions(+), 79 deletions(-) diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index 974eaa3206..2962b6385d 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -236,44 +236,17 @@ class EntropyObjectiveFunction { } }; -/** @brief The base class for the tweedie family of objective functions: - * mean-squared-error(p=0), poisson(p=1), gamma(p=2) and inverse gaussian(p=3) - **/ template -class TweedieObjectiveFunction { +class MSEObjectiveFunction { public: using DataT = DataT_; using LabelT = LabelT_; using IdxT = IdxT_; - using BinT = AggregateBin; - - protected: - IdxT min_samples_leaf; - - public: - HDI TweedieObjectiveFunction(IdxT min_samples_leaf) : min_samples_leaf(min_samples_leaf) {} - - DI IdxT NumClasses() const { return 1; } - - static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out) - { - for (int i = 0; i < nclasses; i++) { - out[i] = shist[i].label_sum / shist[i].count; - } - } -}; - -template -class MSEObjectiveFunction : public TweedieObjectiveFunction { - public: - using DataT = DataT_; - using LabelT = LabelT_; - using IdxT = IdxT_; - // using BinT = typename TweedieObjectiveFunction::BinT; using BinT = AggregateBin; + IdxT min_samples_leaf; - HDI MSEObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) - : TweedieObjectiveFunction{min_samples_leaf} + HDI MSEObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) : + min_samples_leaf(min_samples_leaf) { } @@ -283,16 +256,16 @@ class MSEObjectiveFunction : public TweedieObjectiveFunctionmin_samples_leaf || nRight < this->min_samples_leaf) { + if (nLeft < min_samples_leaf || nRight < min_samples_leaf) { return -std::numeric_limits::max(); } else { auto label_sum = hist[nbins - 1].label_sum; - DataT parent_obj = -label_sum * label_sum / len; + DataT parent_obj = -label_sum * label_sum * invLen; DataT left_obj = -(hist[i].label_sum * hist[i].label_sum) / nLeft; DataT right_label_sum = hist[i].label_sum - label_sum; DataT right_obj = -(right_label_sum * right_label_sum) / nRight; gain = parent_obj - (left_obj + right_obj); - gain *= 0.5 * invLen; + gain *= DataT(0.5) * invLen; return gain; } @@ -304,25 +277,35 @@ class MSEObjectiveFunction : public TweedieObjectiveFunction sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { auto nLeft = shist[i].count; - sp.update({sbins[i], col, this->GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); + sp.update({sbins[i], col, GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); } return sp; } + + DI IdxT NumClasses() const { return 1; } + + static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out) + { + for (int i = 0; i < nclasses; i++) { + out[i] = shist[i].label_sum / shist[i].count; + } + } }; template -class PoissonObjectiveFunction : public TweedieObjectiveFunction { +class PoissonObjectiveFunction { public: using DataT = DataT_; using LabelT = LabelT_; using IdxT = IdxT_; - // using BinT = typename TweedieObjectiveFunction::BinT; using BinT = AggregateBin; + IdxT min_samples_leaf; static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); HDI 
PoissonObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) - : TweedieObjectiveFunction{min_samples_leaf} + // : TweedieObjectiveFunction{min_samples_leaf} + : min_samples_leaf(min_samples_leaf) { } @@ -343,7 +326,7 @@ class PoissonObjectiveFunction : public TweedieObjectiveFunctionmin_samples_leaf || nRight < this->min_samples_leaf) + if (nLeft < min_samples_leaf || nRight < min_samples_leaf) return -std::numeric_limits::max(); auto label_sum = hist[nbins - 1].label_sum; @@ -358,7 +341,7 @@ class PoissonObjectiveFunction : public TweedieObjectiveFunction sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { auto nLeft = shist[i].count; - sp.update({sbins[i], col, this->GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); + sp.update({sbins[i], col, GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); } return sp; } + + DI IdxT NumClasses() const { return 1; } + + static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out) + { + for (int i = 0; i < nclasses; i++) { + out[i] = shist[i].label_sum / shist[i].count; + } + } }; template -class GammaObjectiveFunction : public TweedieObjectiveFunction { +class GammaObjectiveFunction { public: using DataT = DataT_; using LabelT = LabelT_; using IdxT = IdxT_; using BinT = AggregateBin; static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); + IdxT min_samples_leaf; HDI GammaObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) - : TweedieObjectiveFunction{min_samples_leaf} + : min_samples_leaf{min_samples_leaf} { } + /** + * @brief compute the gamma impurity reduction (or purity gain) for each split + * + * @note This method is used to speed up the search for the best split + * by calculating the gain using a proxy gamma half deviance reduction. + * It is a proxy quantity such that the split that maximizes this value + * also maximizes the impurity improvement. It neglects all constant terms + * of the impurity decrease for a given split. + * The Gain is the difference in the proxy impurities of the parent and the + * weighted sum of impurities of its children. + */ HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const { IdxT nRight = len - nLeft; // if there aren't enough samples in this split, don't bother! 
- if (nLeft < this->min_samples_leaf || nRight < this->min_samples_leaf) + if (nLeft < min_samples_leaf || nRight < min_samples_leaf) return -std::numeric_limits::max(); DataT label_sum = hist[nbins - 1].label_sum; @@ -406,11 +410,11 @@ class GammaObjectiveFunction : public TweedieObjectiveFunction::max(); // compute the gain to be - DataT parent_obj = raft::myLog(label_sum / len); - DataT left_obj = (DataT(nLeft) / DataT(len)) * raft::myLog(left_label_sum / nLeft); - DataT right_obj = (DataT(nRight) / DataT(len)) * raft::myLog(right_label_sum / nRight); + DataT parent_obj = len * raft::myLog(label_sum / len); + DataT left_obj = nLeft * raft::myLog(left_label_sum / nLeft); + DataT right_obj = nRight * raft::myLog(right_label_sum / nRight); DataT gain = parent_obj - (left_obj + right_obj); - // gain = gain / DataT(len); + gain = gain / DataT(len); return gain; } @@ -421,33 +425,53 @@ class GammaObjectiveFunction : public TweedieObjectiveFunction sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { auto nLeft = shist[i].count; - sp.update({sbins[i], col, this->GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); + sp.update({sbins[i], col, GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); } return sp; } + DI IdxT NumClasses() const { return 1; } + + static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out) + { + for (int i = 0; i < nclasses; i++) { + out[i] = shist[i].label_sum / shist[i].count; + } + } }; template -class InverseGaussianObjectiveFunction : public TweedieObjectiveFunction { +class InverseGaussianObjectiveFunction { public: using DataT = DataT_; using LabelT = LabelT_; using IdxT = IdxT_; using BinT = AggregateBin; static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); + IdxT min_samples_leaf; HDI InverseGaussianObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) - : TweedieObjectiveFunction{min_samples_leaf} + : min_samples_leaf{min_samples_leaf} { } + /** + * @brief compute the inverse gaussian impurity reduction (or purity gain) for each split + * + * @note This method is used to speed up the search for the best split + * by calculating the gain using a proxy inverse gaussian half deviance reduction. + * It is a proxy quantity such that the split that maximizes this value + * also maximizes the impurity improvement. It neglects all constant terms + * of the impurity decrease for a given split. + * The Gain is the difference in the proxy impurities of the parent and the + * weighted sum of impurities of its children. + */ HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const { // get the lens' IdxT nRight = len - nLeft; // if there aren't enough samples in this split, don't bother! 
- if (nLeft < this->min_samples_leaf || nRight < this->min_samples_leaf) + if (nLeft < min_samples_leaf || nRight < min_samples_leaf) return -std::numeric_limits::max(); auto label_sum = hist[nbins - 1].label_sum; @@ -462,7 +486,7 @@ class InverseGaussianObjectiveFunction : public TweedieObjectiveFunction sp; for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { auto nLeft = shist[i].count; - sp.update({sbins[i], col, this->GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); + sp.update({sbins[i], col, GainPerSplit(shist, i, nbins, len, nLeft), nLeft}); } return sp; } + DI IdxT NumClasses() const { return 1; } + + static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out) + { + for (int i = 0; i < nclasses; i++) { + out[i] = shist[i].label_sum / shist[i].count; + } + } }; } // end namespace DT } // end namespace ML \ No newline at end of file diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index ed95bd3207..92e658c3bc 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -15,7 +15,6 @@ */ #include #include -#include #include #include @@ -899,13 +898,13 @@ class ObjectiveTest : public ::testing::TestWithParam { auto Entropy(std::vector const& data) { // sum((n_c/n_total)*(log(n_c/n_total))) - double entropy(0); + DataT entropy(0); for (auto c = 0; c < params.n_classes; ++c) { IdxT sum(0); std::for_each(data.begin(), data.end(), [&](auto d) { if (d == DataT(c)) ++sum; }); - double class_proba = double(sum) / data.size(); + DataT class_proba = DataT(sum) / data.size(); entropy += -class_proba * raft::myLog(class_proba ? class_proba : DataT(1)) / raft::myLog(DataT(2)); // adding gain } return entropy; @@ -920,9 +919,9 @@ class ObjectiveTest : public ::testing::TestWithParam { auto parent_entropy = Entropy(data); auto left_entropy = Entropy(left_data); auto right_entropy = Entropy(right_data); - double n = data.size(); - double left_n = left_data.size(); - double right_n = right_data.size(); + DataT n = data.size(); + DataT left_n = left_data.size(); + DataT right_n = right_data.size(); auto gain = parent_entropy - ((left_n / n) * left_entropy + (right_n / n) * right_entropy); @@ -936,13 +935,13 @@ class ObjectiveTest : public ::testing::TestWithParam { auto GiniImpurity(std::vector const& data) { // sum((n_c/n_total)(1-(n_c/n_total))) - double gini(0); + DataT gini(0); for (auto c = 0; c < params.n_classes; ++c) { IdxT sum(0); std::for_each(data.begin(), data.end(), [&](auto d) { if (d == DataT(c)) ++sum; }); - double class_proba = double(sum) / data.size(); + DataT class_proba = DataT(sum) / data.size(); gini += class_proba * (1 - class_proba); // adding gain } return gini; @@ -957,9 +956,9 @@ class ObjectiveTest : public ::testing::TestWithParam { auto parent_gini = GiniImpurity(data); auto left_gini = GiniImpurity(left_data); auto right_gini = GiniImpurity(right_data); - double n = data.size(); - double left_n = left_data.size(); - double right_n = right_data.size(); + DataT n = data.size(); + DataT left_n = left_data.size(); + DataT right_n = right_data.size(); auto gain = parent_gini - ((left_n / n) * left_gini + (right_n / n) * right_gini); @@ -999,7 +998,7 @@ class ObjectiveTest : public ::testing::TestWithParam { { return GiniGroundTruthGain(data, split_bin_index); } - return double(0.0); + return DataT(0.0); } auto NumLeftOfBin(std::vector const& cdf_hist, IdxT idx) @@ -1086,18 +1085,36 @@ TEST_P(MSEObjectiveTestD, MSEObjectiveTest) {} INSTANTIATE_TEST_CASE_P(RfTests, MSEObjectiveTestD, ::testing::ValuesIn(mse_objective_test_parameters)); 
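The proxy gains implemented above can be cross-checked on the host with a few lines of NumPy. The sketch below is illustrative and not part of the patch (all names are local to the sketch); it mirrors what `PoissonObjectiveFunction::GainPerSplit` computes from the scanned histogram. Because the `sum(y * log(y))` terms of the half deviance cancel between parent and children, this proxy equals the exact Poisson half-deviance reduction when each node predicts its label mean:

.. code-block:: python

    import numpy as np

    def poisson_proxy_gain(y, n_left):
        # Split y into y[:n_left] (left child) and y[n_left:] (right child);
        # mirrors parent_obj - (left_obj + right_obj), scaled by invLen.
        n, n_right = len(y), len(y) - n_left
        s, s_left = y.sum(), y[:n_left].sum()
        s_right = s - s_left
        parent_obj = -s * np.log(s / n)
        left_obj = -s_left * np.log(s_left / n_left)
        right_obj = -s_right * np.log(s_right / n_right)
        return (parent_obj - (left_obj + right_obj)) / n

    rng = np.random.default_rng(0)
    y = rng.poisson(2.0, size=64) + 1e-6   # keep all partial sums positive
    print(poisson_proxy_gain(y, n_left=32))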
+typedef ObjectiveTest> MSEObjectiveTestF; +TEST_P(MSEObjectiveTestF, MSEObjectiveTest) {} +INSTANTIATE_TEST_CASE_P(RfTests, + MSEObjectiveTestF, + ::testing::ValuesIn(mse_objective_test_parameters)); + // poisson objective test typedef ObjectiveTest> PoissonObjectiveTestD; TEST_P(PoissonObjectiveTestD, poissonObjectiveTest) {} INSTANTIATE_TEST_CASE_P(RfTests, PoissonObjectiveTestD, ::testing::ValuesIn(poisson_objective_test_parameters)); +typedef ObjectiveTest> PoissonObjectiveTestF; +TEST_P(PoissonObjectiveTestF, poissonObjectiveTest) {} +INSTANTIATE_TEST_CASE_P(RfTests, + PoissonObjectiveTestF, + ::testing::ValuesIn(poisson_objective_test_parameters)); + // gamma objective test typedef ObjectiveTest> GammaObjectiveTestD; TEST_P(GammaObjectiveTestD, GammaObjectiveTest) {} INSTANTIATE_TEST_CASE_P(RfTests, GammaObjectiveTestD, ::testing::ValuesIn(gamma_objective_test_parameters)); +typedef ObjectiveTest> GammaObjectiveTestF; +TEST_P(GammaObjectiveTestF, GammaObjectiveTest) {} +INSTANTIATE_TEST_CASE_P(RfTests, + GammaObjectiveTestF, + ::testing::ValuesIn(gamma_objective_test_parameters)); + // InvGauss objective test typedef ObjectiveTest> InverseGaussianObjectiveTestD; @@ -1105,6 +1122,12 @@ TEST_P(InverseGaussianObjectiveTestD, InverseGaussianObjectiveTest) {} INSTANTIATE_TEST_CASE_P(RfTests, InverseGaussianObjectiveTestD, ::testing::ValuesIn(invgauss_objective_test_parameters)); +typedef ObjectiveTest> + InverseGaussianObjectiveTestF; +TEST_P(InverseGaussianObjectiveTestF, InverseGaussianObjectiveTest) {} +INSTANTIATE_TEST_CASE_P(RfTests, + InverseGaussianObjectiveTestF, + ::testing::ValuesIn(invgauss_objective_test_parameters)); // entropy objective test typedef ObjectiveTest> EntropyObjectiveTestD; @@ -1112,6 +1135,11 @@ TEST_P(EntropyObjectiveTestD, entropyObjectiveTest) {} INSTANTIATE_TEST_CASE_P(RfTests, EntropyObjectiveTestD, ::testing::ValuesIn(entropy_objective_test_parameters)); +typedef ObjectiveTest> EntropyObjectiveTestF; +TEST_P(EntropyObjectiveTestF, entropyObjectiveTest) {} +INSTANTIATE_TEST_CASE_P(RfTests, + EntropyObjectiveTestF, + ::testing::ValuesIn(entropy_objective_test_parameters)); // gini objective test typedef ObjectiveTest> GiniObjectiveTestD; @@ -1119,6 +1147,11 @@ TEST_P(GiniObjectiveTestD, giniObjectiveTest) {} INSTANTIATE_TEST_CASE_P(RfTests, GiniObjectiveTestD, ::testing::ValuesIn(gini_objective_test_parameters)); +typedef ObjectiveTest> GiniObjectiveTestF; +TEST_P(GiniObjectiveTestF, giniObjectiveTest) {} +INSTANTIATE_TEST_CASE_P(RfTests, + GiniObjectiveTestF, + ::testing::ValuesIn(gini_objective_test_parameters)); } // end namespace DT } // end namespace ML diff --git a/python/cuml/test/test_random_forest.py b/python/cuml/test/test_random_forest.py index 9d1d7bb486..9eb3d7a46a 100644 --- a/python/cuml/test/test_random_forest.py +++ b/python/cuml/test/test_random_forest.py @@ -32,7 +32,7 @@ from sklearn.ensemble import RandomForestClassifier as skrfc from sklearn.ensemble import RandomForestRegressor as skrfr from sklearn.metrics import accuracy_score, mean_squared_error, \ - mean_poisson_deviance + mean_tweedie_deviance from sklearn.datasets import fetch_california_housing, \ make_classification, make_regression, load_iris, load_breast_cancer, \ load_boston @@ -187,21 +187,34 @@ def special_reg(request): return X, y -@pytest.mark.parametrize("lam", [0.01, 0.1]) @pytest.mark.parametrize("max_depth", [2, 4]) -def test_poisson_convergence(lam, max_depth): +@pytest.mark.parametrize("split_criterion", + ["poisson", "gamma", "inverse_gaussian"]) +def 
test_tweedie_convergence(max_depth, split_criterion): np.random.seed(33) bootstrap = None max_features = 1.0 n_estimators = 1 min_impurity_decrease = 1e-5 n_datapoints = 100000 - # generating random poisson dataset + tweedie = { + "poisson": + {"power": 1, + "gen": np.random.poisson, "args": [0.1]}, + "gamma": + {"power": 2, + "gen": np.random.gamma, "args": [1.0]}, + "inverse_gaussian": + {"power": 3, + "gen": np.random.wald, "args": [0.1, 1.0]} + } + # generating random dataset with tweedie distribution X = np.random.random((n_datapoints, 4)).astype(np.float32) - y = np.random.poisson(lam=lam, size=n_datapoints).astype(np.float32) + y = tweedie[split_criterion]["gen"](*tweedie[split_criterion]["args"], + size=n_datapoints).astype(np.float32) - poisson_preds = curfr( - split_criterion=4, + tweedie_preds = curfr( + split_criterion=split_criterion, max_depth=max_depth, n_estimators=n_estimators, bootstrap=bootstrap, @@ -216,12 +229,16 @@ def test_poisson_convergence(lam, max_depth): min_impurity_decrease=min_impurity_decrease).fit(X, y).predict(X) # y should not be non-positive for mean_poisson_deviance mask = mse_preds > 0 - mse_mpd = mean_poisson_deviance(y[mask], mse_preds[mask]) - poisson_mpd = mean_poisson_deviance(y, poisson_preds) - - # model trained on poisson data with - # poisson criterion must perform better on poisson loss - assert mse_mpd >= poisson_mpd + mse_tweedie_deviance = mean_tweedie_deviance(y[mask], + mse_preds[mask], + power=tweedie[split_criterion]["power"]) + tweedie_tweedie_deviance = mean_tweedie_deviance(y[mask], + tweedie_preds[mask], + power=tweedie[split_criterion]["power"]) + + # model trained on tweedie data with + # tweedie criterion must perform better on tweedie loss + assert mse_tweedie_deviance >= tweedie_tweedie_deviance @pytest.mark.parametrize( From 2fa43d7852b5ea7898a32d0139b77ae5db32d77a Mon Sep 17 00:00:00 2001 From: venkywonka Date: Mon, 4 Oct 2021 18:11:36 +0530 Subject: [PATCH 30/42] FIX clang format --- .../batched-levelalgo/metrics.cuh | 16 ++--- cpp/test/sg/rf_test.cu | 67 +++++++++---------- 2 files changed, 40 insertions(+), 43 deletions(-) diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index 2962b6385d..10a410dce8 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -242,11 +242,11 @@ class MSEObjectiveFunction { using DataT = DataT_; using LabelT = LabelT_; using IdxT = IdxT_; - using BinT = AggregateBin; + using BinT = AggregateBin; IdxT min_samples_leaf; - HDI MSEObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) : - min_samples_leaf(min_samples_leaf) + HDI MSEObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) + : min_samples_leaf(min_samples_leaf) { } @@ -298,7 +298,7 @@ class PoissonObjectiveFunction { using DataT = DataT_; using LabelT = LabelT_; using IdxT = IdxT_; - using BinT = AggregateBin; + using BinT = AggregateBin; IdxT min_samples_leaf; static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); @@ -341,7 +341,7 @@ class PoissonObjectiveFunction { DataT parent_obj = -label_sum * raft::myLog(label_sum / len); DataT left_obj = -left_label_sum * raft::myLog(left_label_sum / nLeft); DataT right_obj = -right_label_sum * raft::myLog(right_label_sum / nRight); - DataT gain = parent_obj - (left_obj + right_obj); + DataT gain = parent_obj - (left_obj + right_obj); gain = gain / len; return gain; @@ -379,7 +379,7 @@ class GammaObjectiveFunction { IdxT min_samples_leaf; HDI 
GammaObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) - : min_samples_leaf{min_samples_leaf} + : min_samples_leaf{min_samples_leaf} { } @@ -486,8 +486,8 @@ class InverseGaussianObjectiveFunction { DataT parent_obj = -DataT(len) * DataT(len) / label_sum; DataT left_obj = -DataT(nLeft) * DataT(nLeft) / left_label_sum; DataT right_obj = -DataT(nRight) * DataT(nRight) / right_label_sum; - DataT gain = parent_obj - (left_obj + right_obj); - gain = gain / (2 * len ); + DataT gain = parent_obj - (left_obj + right_obj); + gain = gain / (2 * len); return gain; } diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index 92e658c3bc..dd9e62e2f3 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -450,7 +450,7 @@ class RfTest : public ::testing::TestWithParam { void SetUp() override { RfTestParams params = ::testing::TestWithParam::GetParam(); - bool is_regression = params.split_criterion != GINI and params.split_criterion != ENTROPY; + bool is_regression = params.split_criterion != GINI and params.split_criterion != ENTROPY; if (params.double_precision) { if (is_regression) { RfSpecialisedTest test(params); @@ -483,16 +483,15 @@ std::vector min_samples_leaf = {1, 10, 30}; std::vector min_samples_split = {2, 10}; std::vector min_impurity_decrease = {0.0f, 1.0f, 10.0f}; std::vector n_streams = {1, 2, 10}; -std::vector split_criterion = { - CRITERION::INVERSE_GAUSSIAN, - CRITERION::GAMMA, - CRITERION::POISSON, - CRITERION::MSE, - CRITERION::GINI, - CRITERION::ENTROPY}; -std::vector seed = {0, 17}; -std::vector n_labels = {2, 10, 20}; -std::vector double_precision = {false, true}; +std::vector split_criterion = {CRITERION::INVERSE_GAUSSIAN, + CRITERION::GAMMA, + CRITERION::POISSON, + CRITERION::MSE, + CRITERION::GINI, + CRITERION::ENTROPY}; +std::vector seed = {0, 17}; +std::vector n_labels = {2, 10, 20}; +std::vector double_precision = {false, true}; int n_tests = 100; @@ -722,7 +721,7 @@ class ObjectiveTest : public ::testing::TestWithParam { auto data_end = data_begin + bin_width; if constexpr (std::is_same::value) { // classification case auto count(IdxT(0)); - std::for_each(data_begin, data_end, [&](auto d){ + std::for_each(data_begin, data_end, [&](auto d) { if (d == c) ++count; }); pdf_hist.emplace_back(count); @@ -741,10 +740,7 @@ class ObjectiveTest : public ::testing::TestWithParam { return std::make_pair(cdf_hist, pdf_hist); } - - auto MSE( - std::vector const& - data) // 1/n * 1/2 * sum((y - y_pred) * (y - y_pred)) + auto MSE(std::vector const& data) // 1/n * 1/2 * sum((y - y_pred) * (y - y_pred)) { DataT sum = std::accumulate(data.begin(), data.end(), DataT(0)); DataT const mean = sum / data.size(); @@ -768,9 +764,9 @@ class ObjectiveTest : public ::testing::TestWithParam { auto [left_mse, label_sum_left, n_left] = MSE(left_data); auto [right_mse, label_sum_right, n_right] = MSE(right_data); - auto gain = parent_mse - - ((n_left / n) * left_mse + // the minimizing objective function is half deviance - (n_right / n) * right_mse); // gain in long form without proxy + auto gain = + parent_mse - ((n_left / n) * left_mse + // the minimizing objective function is half deviance + (n_right / n) * right_mse); // gain in long form without proxy // edge cases if (n_left < params.min_samples_leaf or n_right < params.min_samples_leaf) @@ -905,7 +901,8 @@ class ObjectiveTest : public ::testing::TestWithParam { if (d == DataT(c)) ++sum; }); DataT class_proba = DataT(sum) / data.size(); - entropy += -class_proba * raft::myLog(class_proba ? 
class_proba : DataT(1)) / raft::myLog(DataT(2)); // adding gain + entropy += -class_proba * raft::myLog(class_proba ? class_proba : DataT(1)) / + raft::myLog(DataT(2)); // adding gain } return entropy; } @@ -919,9 +916,9 @@ class ObjectiveTest : public ::testing::TestWithParam { auto parent_entropy = Entropy(data); auto left_entropy = Entropy(left_data); auto right_entropy = Entropy(right_data); - DataT n = data.size(); - DataT left_n = left_data.size(); - DataT right_n = right_data.size(); + DataT n = data.size(); + DataT left_n = left_data.size(); + DataT right_n = right_data.size(); auto gain = parent_entropy - ((left_n / n) * left_entropy + (right_n / n) * right_entropy); @@ -956,9 +953,9 @@ class ObjectiveTest : public ::testing::TestWithParam { auto parent_gini = GiniImpurity(data); auto left_gini = GiniImpurity(left_data); auto right_gini = GiniImpurity(right_data); - DataT n = data.size(); - DataT left_n = left_data.size(); - DataT right_n = right_data.size(); + DataT n = data.size(); + DataT left_n = left_data.size(); + DataT right_n = right_data.size(); auto gain = parent_gini - ((left_n / n) * left_gini + (right_n / n) * right_gini); @@ -972,25 +969,25 @@ class ObjectiveTest : public ::testing::TestWithParam { auto GroundTruthGain(std::vector const& data, std::size_t const split_bin_index) { - if constexpr (std::is_same>::value) // mean squared error + if constexpr (std::is_same>:: + value) // mean squared error { return MSEGroundTruthGain(data, split_bin_index); - } else if constexpr (std::is_same>::value) // poisson + } else if constexpr (std::is_same>:: + value) // poisson { return PoissonGroundTruthGain(data, split_bin_index); } else if constexpr (std::is_same>::value) // gamma { return GammaGroundTruthGain(data, split_bin_index); - } else if constexpr (std::is_same< - ObjectiveT, - InverseGaussianObjectiveFunction>::value) // inverse gaussian + } else if constexpr (std::is_same>:: + value) // inverse gaussian { return InverseGaussianGroundTruthGain(data, split_bin_index); - } else if constexpr (std::is_same>::value) // entropy + } else if constexpr (std::is_same>:: + value) // entropy { return EntropyGroundTruthGain(data, split_bin_index); } else if constexpr (std::is_same Date: Tue, 5 Oct 2021 14:45:37 +0530 Subject: [PATCH 31/42] reduce division operations --- .../batched-levelalgo/metrics.cuh | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index 10a410dce8..0eca52dfcb 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -102,7 +102,7 @@ class GiniObjectiveFunction { { IdxT nRight = len - nLeft; constexpr DataT One = DataT(1.0); - auto invlen = One / len; + auto invLen = One / len; auto invLeft = One / nLeft; auto invRight = One / nRight; auto gain = DataT(0.0); @@ -115,16 +115,16 @@ class GiniObjectiveFunction { int val_i = 0; auto lval_i = hist[nbins * j + i].x; auto lval = DataT(lval_i); - gain += lval * invLeft * lval * invlen; + gain += lval * invLeft * lval * invLen; val_i += lval_i; auto total_sum = hist[nbins * j + nbins - 1].x; auto rval_i = total_sum - lval_i; auto rval = DataT(rval_i); - gain += rval * invRight * rval * invlen; + gain += rval * invRight * rval * invLen; val_i += rval_i; - auto val = DataT(val_i) * invlen; + auto val = DataT(val_i) * invLen; gain -= val * val; } @@ -254,7 +254,7 @@ class MSEObjectiveFunction { { auto gain{DataT(0)}; 
IdxT nRight{len - nLeft}; - auto invLen{DataT(1.0) / len}; + auto invLen = DataT(1.0) / len; // if there aren't enough samples in this split, don't bother! if (nLeft < min_samples_leaf || nRight < min_samples_leaf) { return -std::numeric_limits::max(); @@ -304,7 +304,6 @@ class PoissonObjectiveFunction { static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); HDI PoissonObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) - // : TweedieObjectiveFunction{min_samples_leaf} : min_samples_leaf(min_samples_leaf) { } @@ -324,6 +323,7 @@ class PoissonObjectiveFunction { { // get the lens' IdxT nRight = len - nLeft; + auto invLen = DataT(1) / len; // if there aren't enough samples in this split, don't bother! if (nLeft < min_samples_leaf || nRight < min_samples_leaf) @@ -338,11 +338,11 @@ class PoissonObjectiveFunction { return -std::numeric_limits::max(); // compute the gain to be - DataT parent_obj = -label_sum * raft::myLog(label_sum / len); + DataT parent_obj = -label_sum * raft::myLog(label_sum * invLen); DataT left_obj = -left_label_sum * raft::myLog(left_label_sum / nLeft); DataT right_obj = -right_label_sum * raft::myLog(right_label_sum / nRight); DataT gain = parent_obj - (left_obj + right_obj); - gain = gain / len; + gain = gain * invLen; return gain; } @@ -397,6 +397,8 @@ class GammaObjectiveFunction { HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const { IdxT nRight = len - nLeft; + auto invLen = DataT(1) / len; + // if there aren't enough samples in this split, don't bother! if (nLeft < min_samples_leaf || nRight < min_samples_leaf) return -std::numeric_limits::max(); @@ -410,11 +412,11 @@ class GammaObjectiveFunction { return -std::numeric_limits::max(); // compute the gain to be - DataT parent_obj = len * raft::myLog(label_sum / len); + DataT parent_obj = len * raft::myLog(label_sum * invLen); DataT left_obj = nLeft * raft::myLog(left_label_sum / nLeft); DataT right_obj = nRight * raft::myLog(right_label_sum / nRight); DataT gain = parent_obj - (left_obj + right_obj); - gain = gain / DataT(len); + gain = gain * invLen; return gain; } From 846462846d23fc3f85d9d7a9056744dbc65b59dd Mon Sep 17 00:00:00 2001 From: venkywonka Date: Tue, 5 Oct 2021 18:16:26 +0530 Subject: [PATCH 32/42] flake fix and change criterion_dict --- python/cuml/dask/ensemble/randomforestclassifier.py | 3 ++- python/cuml/ensemble/randomforest_common.pyx | 2 +- python/cuml/test/test_random_forest.py | 7 +++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py index c867f63841..39596a2823 100755 --- a/python/cuml/dask/ensemble/randomforestclassifier.py +++ b/python/cuml/dask/ensemble/randomforestclassifier.py @@ -81,7 +81,8 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin, 4 or 'poisson' for POISSON, 5 or 'gamma' for GAMMA, 6 or 'inverse_gaussian' for INVERSE_GAUSSIAN, - 2, 'mse', 4, 'poisson', 5, 'gamma', 6, 'inverse_gaussian' not valid for classification + 2, 'mse', 4, 'poisson', 5, 'gamma', 6, 'inverse_gaussian' not valid + for classification bootstrap : boolean (default = True) Control bootstrapping. 
If set, each tree in the forest is built diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index ad43dc07b6..7e7a6b1dc8 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -61,7 +61,7 @@ class BaseRandomForestModel(Base): '5': GAMMA, 'gamma': GAMMA, '6': INVERSE_GAUSSIAN, 'inverse_gaussian': INVERSE_GAUSSIAN, - '6': CRITERION_END} + '7': CRITERION_END} classes_ = CumlArrayDescriptor() diff --git a/python/cuml/test/test_random_forest.py b/python/cuml/test/test_random_forest.py index 9eb3d7a46a..ceb0407fef 100644 --- a/python/cuml/test/test_random_forest.py +++ b/python/cuml/test/test_random_forest.py @@ -231,10 +231,13 @@ def test_tweedie_convergence(max_depth, split_criterion): mask = mse_preds > 0 mse_tweedie_deviance = mean_tweedie_deviance(y[mask], mse_preds[mask], - power=tweedie[split_criterion]["power"]) + power=tweedie + [split_criterion]["power"]) tweedie_tweedie_deviance = mean_tweedie_deviance(y[mask], tweedie_preds[mask], - power=tweedie[split_criterion]["power"]) + power=tweedie + [split_criterion]["power"] + ) # model trained on tweedie data with # tweedie criterion must perform better on tweedie loss From d764562d14ba15b8d2085dc618fbe17a5ff47990 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Tue, 5 Oct 2021 18:30:06 +0530 Subject: [PATCH 33/42] make objective data members private --- .../decisiontree/batched-levelalgo/metrics.cuh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh index 0eca52dfcb..28c3c6c553 100644 --- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh @@ -86,6 +86,8 @@ class GiniObjectiveFunction { using DataT = DataT_; using LabelT = LabelT_; using IdxT = IdxT_; + + private: IdxT nclasses; IdxT min_samples_leaf; @@ -162,6 +164,8 @@ class EntropyObjectiveFunction { using DataT = DataT_; using LabelT = LabelT_; using IdxT = IdxT_; + + private: IdxT nclasses; IdxT min_samples_leaf; @@ -243,8 +247,11 @@ class MSEObjectiveFunction { using LabelT = LabelT_; using IdxT = IdxT_; using BinT = AggregateBin; + + private: IdxT min_samples_leaf; + public: HDI MSEObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) : min_samples_leaf(min_samples_leaf) { @@ -299,8 +306,11 @@ class PoissonObjectiveFunction { using LabelT = LabelT_; using IdxT = IdxT_; using BinT = AggregateBin; + + private: IdxT min_samples_leaf; + public: static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); HDI PoissonObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) @@ -376,8 +386,11 @@ class GammaObjectiveFunction { using IdxT = IdxT_; using BinT = AggregateBin; static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); + + private: IdxT min_samples_leaf; + public: HDI GammaObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) : min_samples_leaf{min_samples_leaf} { @@ -449,8 +462,11 @@ class InverseGaussianObjectiveFunction { using IdxT = IdxT_; using BinT = AggregateBin; static constexpr auto eps_ = 10 * std::numeric_limits::epsilon(); + + private: IdxT min_samples_leaf; + public: HDI InverseGaussianObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf) : min_samples_leaf{min_samples_leaf} { From 68ecabbb237801712215ee5aa1790d62e998caa0 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Wed, 6 Oct 2021 19:40:51 +0530 Subject: [PATCH 34/42] refactor declaration --- 
 cpp/test/sg/rf_test.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index dd9e62e2f3..59aa2c29d3 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -699,7 +699,7 @@ class ObjectiveTest : public ::testing::TestWithParam<ObjectiveTestParameters> {
     } else {
       std::normal_distribution<DataT> normal(1.0, 2.0);
       for (auto& d : data) {
-        auto rand_element(DataT(0));
+        auto rand_element{DataT(0)};
         while (1) {
           rand_element = normal(rng);
           if (rand_element > 0) break;  // only positive random numbers
         }
         d = rand_element;
       }
     }
     return data;
   }
@@ -720,13 +720,13 @@ class ObjectiveTest : public ::testing::TestWithParam<ObjectiveTestParameters> {
       auto data_begin = data.begin() + b * bin_width;
       auto data_end   = data_begin + bin_width;
       if constexpr (std::is_same<BinT, CountBin>::value) {  // classification case
-        auto count(IdxT(0));
+        auto count{IdxT(0)};
         std::for_each(data_begin, data_end, [&](auto d) {
           if (d == c) ++count;
         });
         pdf_hist.emplace_back(count);
       } else {  // regression case
-        auto label_sum(DataT(0));
+        auto label_sum{DataT(0)};
         label_sum = std::accumulate(data_begin, data_end, DataT(0));
         pdf_hist.emplace_back(label_sum, bin_width);
       }

From b1be698ed17f7c2152db1f148aaad46f120ee611 Mon Sep 17 00:00:00 2001
From: venkywonka
Date: Wed, 6 Oct 2021 21:13:51 +0530
Subject: [PATCH 35/42] fix improper merge

---
 cpp/src/decisiontree/decisiontree.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/src/decisiontree/decisiontree.cuh b/cpp/src/decisiontree/decisiontree.cuh
index b4d114f372..c06d587539 100644
--- a/cpp/src/decisiontree/decisiontree.cuh
+++ b/cpp/src/decisiontree/decisiontree.cuh
@@ -310,6 +310,7 @@ class DecisionTree {
         .train();
     } else if (params.split_criterion == CRITERION::GAMMA) {
       return Builder<GammaObjectiveFunction<DataT, LabelT, IdxT>>(handle,
+                                                                  s,
                                                                   treeid,
                                                                   seed,
                                                                   params,
@@ -323,6 +324,7 @@ class DecisionTree {
         .train();
     } else if (params.split_criterion == CRITERION::INVERSE_GAUSSIAN) {
       return Builder<InverseGaussianObjectiveFunction<DataT, LabelT, IdxT>>(handle,
+                                                                            s,
                                                                             treeid,
                                                                             seed,
                                                                             params,

From d1e369d38786ed5671aa44d099c482f1c23e0eaa Mon Sep 17 00:00:00 2001
From: venkywonka
Date: Mon, 11 Oct 2021 19:48:18 +0530
Subject: [PATCH 36/42] refactor new changes to docs

---
 docs/source/checkpoint.tl                     | Bin 0 -> 8920 bytes
 docs/source/conf.py                           |   1 +
 docs/source/kmeans_model.pkl                  |   1 +
 .../dask/ensemble/randomforestclassifier.py   |  18 +++++++++---------
 .../dask/ensemble/randomforestregressor.py    |  18 +++++++++---------
 .../cuml/ensemble/randomforestclassifier.pyx  |  18 +++++++++---------
 .../cuml/ensemble/randomforestregressor.pyx   |  18 +++++++++---------
 7 files changed, 38 insertions(+), 36 deletions(-)
 create mode 100644 docs/source/checkpoint.tl
 create mode 100644 docs/source/kmeans_model.pkl

diff --git a/docs/source/checkpoint.tl b/docs/source/checkpoint.tl
new file mode 100644
index 0000000000000000000000000000000000000000..aaf4e9af43e595620a032cc9a92bbf2609669ed1
GIT binary patch
[base85 GIT binary patch data for docs/source/checkpoint.tl (literal 8920) omitted]

Date: Mon, 11 Oct 2021 20:03:31 +0530
Subject: [PATCH 37/42] prune artifacts

---
 docs/source/checkpoint.tl    | Bin 8920 -> 0 bytes
 docs/source/conf.py          |   1 -
 docs/source/kmeans_model.pkl |   1 -
 3 files changed, 2 deletions(-)
 delete mode 100644 docs/source/checkpoint.tl
 delete mode 100644 docs/source/kmeans_model.pkl

diff --git a/docs/source/checkpoint.tl b/docs/source/checkpoint.tl
deleted file mode 100644
index aaf4e9af43e595620a032cc9a92bbf2609669ed1..0000000000000000000000000000000000000000
GIT binary patch
[base85 GIT binary patch data (new literal 0, old literal 8920) omitted]

Date: Tue, 19 Oct 2021 14:35:51 +0530
Subject: [PATCH 38/42] flake fix

---
 python/cuml/dask/ensemble/randomforestclassifier.py | 4 ++--
 python/cuml/ensemble/randomforestclassifier.pyx     | 2 +-
 python/cuml/ensemble/randomforestregressor.pyx      | 2 +-
 source                                              | 0
 4 files changed, 4 insertions(+), 4 deletions(-)
 create mode 100644 source

diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py
index 42821c5ee9..5a10820dfd 100755
--- a/python/cuml/dask/ensemble/randomforestclassifier.py
+++ b/python/cuml/dask/ensemble/randomforestclassifier.py
@@ -82,8 +82,8 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin,
         * ``4`` or ``'poisson'`` for poisson half deviance
         * ``5`` or ``'gamma'`` for gamma half deviance
         * ``6`` or ``'inverse_gaussian'`` for inverse gaussian deviance
-        ``2``, ``'mse'``, ``4``, ``'poisson'``, ``5``, ``'gamma'``, ``6``, ``'inverse_gaussian'`` not valid
-        for classification
+        ``2``, ``'mse'``, ``4``, ``'poisson'``, ``5``, ``'gamma'``, ``6``,
+        ``'inverse_gaussian'`` not valid for classification
     bootstrap : boolean (default = True)
         Control bootstrapping.\n
            * If ``True``, each tree in the forest is built on a bootstrapped
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index bb0ed414ec..6dc60a6646 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -222,7 +222,7 @@ class RandomForestClassifier(BaseRandomForestModel,
     verbose : int or boolean, default=False
         Sets logging level. It must be one of ``cuml.common.logger.level_*``.
         See :ref:`verbosity-levels` for more info.
-    output_type : ``{'input', 'cudf', 'cupy', 'numpy','numba'}`` (default=None)
+    output_type : ``{'input','cudf','cupy','numpy','numba'}`` (default=None)
         Variable to control output type of the results and attributes of
         the estimator. If None, it'll inherit the output type set at the
         module level, ``cuml.global_settings.output_type``.
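For readers mapping these docstring names to formulas: the half deviances minimized by the new criteria have the closed forms below. This is an illustrative sketch, not part of the patch; `y` holds observed labels, `mu` is the leaf prediction (the label mean), and both are assumed positive where required. Each function equals `sklearn.metrics.mean_tweedie_deviance(y, mu, power=p) / 2` for `p = 1, 2, 3` respectively:

.. code-block:: python

    import numpy as np

    def poisson_half_deviance(y, mu):   # power = 1; y >= 0, mu > 0
        y = np.asarray(y, dtype=float)
        with np.errstate(divide='ignore', invalid='ignore'):
            ylogy = np.where(y > 0, y * np.log(y / mu), 0.0)
        return np.mean(ylogy + mu - y)

    def gamma_half_deviance(y, mu):     # power = 2; y > 0, mu > 0
        return np.mean(np.log(mu / y) + y / mu - 1.0)

    def inv_gaussian_half_deviance(y, mu):  # power = 3; y > 0, mu > 0
        return np.mean((y - mu) ** 2 / (y * mu ** 2)) / 2.0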
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 23743711f2..8cb589c137 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -230,7 +230,7 @@ class RandomForestRegressor(BaseRandomForestModel, verbose : int or boolean, default=False Sets logging level. It must be one of ``cuml.common.logger.level_*``. See :ref:`verbosity-levels` for more info. - output_type : ``{'input', 'cudf', 'cupy', 'numpy', 'numba'}`` (default=None) + output_type : ``{'input','cudf','cupy','numpy','numba'}`` (default=None) Variable to control output type of the results and attributes of the estimator. If None, it'll inherit the output type set at the module level, ``cuml.global_settings.output_type``. diff --git a/source b/source new file mode 100644 index 0000000000..e69de29bb2 From 9ed70721643858b975df07ecc8fa74943c1d17f4 Mon Sep 17 00:00:00 2001 From: Venkat Date: Tue, 19 Oct 2021 14:37:27 +0530 Subject: [PATCH 39/42] Delete artifact --- source | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 source diff --git a/source b/source deleted file mode 100644 index e69de29bb2..0000000000 From 4ec326e5707d87787c02776a7c3b285104dcd212 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Tue, 19 Oct 2021 16:23:18 +0530 Subject: [PATCH 40/42] undo extra backtick causing test-fail --- python/cuml/ensemble/randomforestclassifier.pyx | 2 +- python/cuml/ensemble/randomforestregressor.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 6dc60a6646..82c9eeaadb 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -220,7 +220,7 @@ class RandomForestClassifier(BaseRandomForestModel, handles in several streams. If it is None, a new one is created. verbose : int or boolean, default=False - Sets logging level. It must be one of ``cuml.common.logger.level_*``. + Sets logging level. It must be one of `cuml.common.logger.level_*`. See :ref:`verbosity-levels` for more info. output_type : ``{'input','cudf','cupy','numpy','numba'}`` (default=None) Variable to control output type of the results and attributes of diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 8cb589c137..b756a78710 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -228,7 +228,7 @@ class RandomForestRegressor(BaseRandomForestModel, handles in several streams. If it is None, a new one is created. verbose : int or boolean, default=False - Sets logging level. It must be one of ``cuml.common.logger.level_*``. + Sets logging level. It must be one of `cuml.common.logger.level_*`. See :ref:`verbosity-levels` for more info. 
output_type : ``{'input','cudf','cupy','numpy','numba'}`` (default=None) Variable to control output type of the results and attributes of From 9e24756743d2686d6ff436e1dbbe718296f2eba1 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Tue, 19 Oct 2021 18:41:41 +0530 Subject: [PATCH 41/42] undo a cosmetic change due to a pytest dependence --- python/cuml/ensemble/randomforestclassifier.pyx | 4 ++-- python/cuml/ensemble/randomforestregressor.pyx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 82c9eeaadb..ae49546a92 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -222,10 +222,10 @@ class RandomForestClassifier(BaseRandomForestModel, verbose : int or boolean, default=False Sets logging level. It must be one of `cuml.common.logger.level_*`. See :ref:`verbosity-levels` for more info. - output_type : ``{'input','cudf','cupy','numpy','numba'}`` (default=None) + output_type : {'input', 'cudf', 'cupy', 'numpy', 'numba'}, default=None Variable to control output type of the results and attributes of the estimator. If None, it'll inherit the output type set at the - module level, ``cuml.global_settings.output_type``. + module level, `cuml.global_settings.output_type`. See :ref:`output-data-type-configuration` for more info. Notes diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index b756a78710..f1df77fc45 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -230,10 +230,10 @@ class RandomForestRegressor(BaseRandomForestModel, verbose : int or boolean, default=False Sets logging level. It must be one of `cuml.common.logger.level_*`. See :ref:`verbosity-levels` for more info. - output_type : ``{'input','cudf','cupy','numpy','numba'}`` (default=None) + output_type : {'input', 'cudf', 'cupy', 'numpy', 'numba'}, default=None Variable to control output type of the results and attributes of the estimator. If None, it'll inherit the output type set at the - module level, ``cuml.global_settings.output_type``. + module level, `cuml.global_settings.output_type`. See :ref:`output-data-type-configuration` for more info. 
Notes From 3f8af469f01040a27684c3b479ac2d960aeed924 Mon Sep 17 00:00:00 2001 From: venkywonka Date: Wed, 20 Oct 2021 15:31:43 +0530 Subject: [PATCH 42/42] address review comments --- python/cuml/dask/ensemble/randomforestclassifier.py | 2 +- python/cuml/ensemble/randomforestclassifier.pyx | 13 ------------- python/cuml/ensemble/randomforestregressor.pyx | 5 ++--- 3 files changed, 3 insertions(+), 17 deletions(-) diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py index 5a10820dfd..b6de8b7d15 100755 --- a/python/cuml/dask/ensemble/randomforestclassifier.py +++ b/python/cuml/dask/ensemble/randomforestclassifier.py @@ -513,7 +513,7 @@ def predict_proba(self, X, Returns ------- - y : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, 1) + y : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_classes) """ if self._get_internal_model() is None: self._set_internal_model(self._concat_treelite_models()) diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index ae49546a92..a38f0ff772 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -240,19 +240,6 @@ class RandomForestClassifier(BaseRandomForestModel, during GPU inference. * While training the model for multi class classification problems, using deep trees or `max_features=1.0` provides better performance. - * Prediction of classes is currently different from how scikit-learn - predicts: - * scikit-learn predicts random forest classifiers by obtaining class - probabilities from each component tree, then averaging these class - probabilities over all the ensemble members, and finally resolving - to the label with highest probability as prediction. - * cuml random forest classifier prediction differs in that, each - component tree generates labels instead of class probabilities; - with the most frequent label over all the trees (the statistical - mode) resolved as prediction. - The above differences might cause marginal variations in accuracy in - tradeoff to better performance. - See: https://github.com/rapidsai/cuml/issues/3764 For additional docs, see `scikitlearn's RandomForestClassifier `_. diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index f1df77fc45..aa45af2543 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -129,11 +129,10 @@ class RandomForestRegressor(BaseRandomForestModel, .. code-block:: python import numpy as np - from cuml.test.utils import get_handle - from cuml.ensemble import RandomForestRegressor as curfc + from cuml.ensemble import RandomForestRegressor as curfr X = np.asarray([[0,10],[0,20],[0,30],[0,40]], dtype=np.float32) y = np.asarray([0.0,1.0,2.0,3.0], dtype=np.float32) - cuml_model = curfc(max_features=1.0, n_bins=128, + cuml_model = curfr(max_features=1.0, n_bins=128, min_samples_leaf=1, min_samples_split=2, n_estimators=40, accuracy_metric='r2')
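To close the series, the behaviour asserted by `test_tweedie_convergence` earlier in these patches can be condensed into a short end-to-end check. The sketch is illustrative only (it assumes a cuML build containing these commits and uses made-up data): a forest split with the matching tweedie criterion should incur no more tweedie deviance than an MSE-split forest trained on the same data.

.. code-block:: python

    import numpy as np
    from cuml.ensemble import RandomForestRegressor as curfr
    from sklearn.metrics import mean_tweedie_deviance

    np.random.seed(33)
    X = np.random.random((10000, 4)).astype(np.float32)
    y = np.random.gamma(1.0, size=10000).astype(np.float32)

    gamma_preds = curfr(split_criterion='gamma', max_depth=4,
                        n_estimators=10).fit(X, y).predict(X)
    mse_preds = curfr(split_criterion='mse', max_depth=4,
                      n_estimators=10).fit(X, y).predict(X)

    mask = (mse_preds > 0) & (gamma_preds > 0)  # deviance needs positive preds
    assert mean_tweedie_deviance(y[mask], gamma_preds[mask], power=2) <= \
        mean_tweedie_deviance(y[mask], mse_preds[mask], power=2)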