From da872572d5852cdd7d024f16f625abecf992e451 Mon Sep 17 00:00:00 2001
From: AJ Schmidt
Date: Mon, 1 Feb 2021 13:24:08 -0500
Subject: [PATCH 01/29] Prepare Changelog for Automation (#3442)

This PR prepares the changelog to be automatically updated during releases.

Authors:
  - AJ Schmidt (@ajschmidt8)

Approvers:
  - Dante Gama Dessavre (@dantegd)

URL: https://github.com/rapidsai/cuml/pull/3442
---
 CHANGELOG.md | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 36cbbf0cae..30ee4f829a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,13 +1,8 @@
-# cuML 0.18.0 (Date TBD)
+# 0.18.0
 
-## New Features
-
-## Improvements
-
-## Bug Fixes
-- PR #3279: Correct pure virtual declaration in manifold_inputs_t
+Please see https://github.com/rapidsai/cuml/releases/tag/branch-0.18-latest for the latest changes to this development branch.
 
-# cuML 0.17.0 (Date TBD)
+# cuML 0.17.0 (10 Dec 2020)
 
 ## New Features
 - PR #3164: Expose silhouette score in Python

From df67553234d3faead723b345b323155b1b8ae0ca Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Mon, 1 Feb 2021 13:30:05 -0600
Subject: [PATCH 02/29] Allow saving Dask RandomForest models immediately
 after training (fixes #3331) (#3388)

This attempts to fix #3331; see that issue for many more details.

Today, `.get_combined_model()` on the Dask RandomForest model objects returns `None` if it is called immediately after training. That pattern is recommended in ["Distributed Model Pickling"](https://docs.rapids.ai/api/cuml/stable/pickling_cuml_models.html#Distributed-Model-Pickling), so without this support there is no way to save a Dask RandomForest model using only public methods / attributes on those classes.

Per https://github.com/rapidsai/cuml/issues/3331#issuecomment-754125703, this PR proposes populating the internal model object whenever `get_combined_model()` is called.

## Notes for Reviewers

* I have not tested this locally. I spent about 3 hours trying to build `cuml` from source following https://github.com/rapidsai/cuml/blob/main/BUILD.md and was not successful. If there is a containerized setup for developing `cuml`, I'd greatly appreciate a pointer to it and would be happy to try it out. I've added a unit test for this change, so I hope that will be enough to confirm that this works and that CI will catch any mistakes I've made.

Thanks for your time and consideration.
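As a minimal sketch of the workflow this change enables (illustrative variable names only; it assumes an already-running dask-cuda cluster and follows the pattern from the "Distributed Model Pickling" docs linked above):

```python
import pickle

from dask.distributed import Client
from cuml.dask.ensemble import RandomForestClassifier

client = Client(cluster)           # `cluster` is an existing dask-cuda cluster
model = RandomForestClassifier(n_estimators=10)
model.fit(X_dask, y_dask)          # X_dask / y_dask: distributed training data

# Before this fix, calling this immediately after fit() returned None.
# Now it concatenates the per-worker forests into one single-GPU model.
combined = model.get_combined_model()

with open("rf_model.pkl", "wb") as f:
    pickle.dump(combined, f)
```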
+ """ + + # set internal model if it hasn't been accessed before + if self._get_internal_model() is None: + self._set_internal_model(self._concat_treelite_models()) + + internal_model = self._check_internal_model(self._get_internal_model()) + + if isinstance(self.internal_model, Iterable): + # This function needs to return a single instance of cuml.Base, + # even if the class is just a composite. + raise ValueError("Expected a single instance of cuml.Base " + "but got %s instead." % type(self.internal_model)) + + elif isinstance(self.internal_model, Future): + internal_model = self.internal_model.result() + + return internal_model + def _func_fit(model, input_data, convert_dtype): X = concatenate([item[0] for item in input_data]) diff --git a/python/cuml/test/dask/test_random_forest.py b/python/cuml/test/dask/test_random_forest.py index 305427383e..3d97f1f381 100644 --- a/python/cuml/test/dask/test_random_forest.py +++ b/python/cuml/test/dask/test_random_forest.py @@ -43,6 +43,9 @@ from cuml.dask.ensemble import RandomForestRegressor as cuRFR_mg from cuml.dask.common import utils as dask_utils +from cuml.ensemble import RandomForestClassifier as cuRFC_sg +from cuml.ensemble import RandomForestRegressor as cuRFR_sg + from dask.array import from_array from sklearn.datasets import make_regression, make_classification from sklearn.model_selection import train_test_split @@ -436,6 +439,47 @@ def predict_with_json_rf_regressor(rf, x): np.testing.assert_almost_equal(pred, expected_pred, decimal=6) +@pytest.mark.parametrize('estimator_type', ['regression', 'classification']) +def test_rf_get_combined_model_right_aftter_fit(client, estimator_type): + max_depth = 3 + n_estimators = 5 + X, y = make_classification() + X = X.astype(np.float32) + if estimator_type == 'classification': + cu_rf_mg = cuRFC_mg( + max_features=1.0, + max_samples=1.0, + n_bins=16, + n_streams=1, + n_estimators=n_estimators, + max_leaves=-1, + max_depth=max_depth + ) + y = y.astype(np.int32) + elif estimator_type == 'regression': + cu_rf_mg = cuRFR_mg( + max_features=1.0, + max_samples=1.0, + n_bins=16, + n_streams=1, + n_estimators=n_estimators, + max_leaves=-1, + max_depth=max_depth + ) + y = y.astype(np.float32) + else: + assert False + X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2) + cu_rf_mg.fit(X_dask, y_dask) + single_gpu_model = cu_rf_mg.get_combined_model() + if estimator_type == 'classification': + assert isinstance(single_gpu_model, cuRFC_sg) + elif estimator_type == 'regression': + assert isinstance(single_gpu_model, cuRFR_sg) + else: + assert False + + @pytest.mark.parametrize('n_estimators', [5, 10, 20]) @pytest.mark.parametrize('detailed_text', [True, False]) def test_rf_get_text(client, n_estimators, detailed_text): From fa9edbc268ac93e5ca6ed69704f6cd0c55b25f4c Mon Sep 17 00:00:00 2001 From: Vinay Deshpande Date: Tue, 2 Feb 2021 20:53:24 +0530 Subject: [PATCH 03/29] Enable feature sampling for the experimental backend of Random Forest (#3364) Feature sampling was not supported for experimental backend of Random Forest (RF). This PR implements feature sampling without needing any auxiliary memory storage. Thus the `colids` array is also removed. Authors: - Vinay Deshpande (@vinaydes) Approvers: - Thejaswi. N. 
Authors:
  - Vinay Deshpande (@vinaydes)

Approvers:
  - Thejaswi. N. S (@teju85)
  - Philip Hyunsu Cho (@hcho3)
  - John Zedlewski (@JohnZed)

URL: https://github.com/rapidsai/cuml/pull/3364
---
 cpp/bench/sg/fil.cu                           | 15 ++--
 cpp/bench/sg/rf_classifier.cu                 | 14 +--
 cpp/bench/sg/rf_regressor.cu                  | 14 +--
 cpp/include/cuml/ensemble/randomforest.hpp    | 11 +--
 cpp/include/cuml/tree/decisiontree.hpp        | 16 ++--
 cpp/include/cuml/tree/flatnode.h              |  1 +
 .../batched-levelalgo/builder.cuh             | 35 ++++----
 .../batched-levelalgo/builder_base.cuh        | 25 ++++--
 .../decisiontree/batched-levelalgo/input.cuh  |  4 +-
 .../batched-levelalgo/kernels.cuh             | 90 +++++++++++++++++--
 .../decisiontree/batched-levelalgo/node.cuh   |  4 +-
 .../decisiontree/batched-levelalgo/split.cuh  |  4 +-
 cpp/src/decisiontree/decisiontree.cu          | 28 +++---
 cpp/src/decisiontree/decisiontree_impl.cuh    | 41 +++++----
 cpp/src/decisiontree/decisiontree_impl.h      | 12 +--
 cpp/src/randomforest/randomforest.cu          |  8 +-
 cpp/src/randomforest/randomforest_impl.cuh    |  9 +-
 cpp/test/sg/decisiontree_batchedlevel_algo.cu | 19 ++--
 .../sg/decisiontree_batchedlevel_unittest.cu  | 12 +--
 cpp/test/sg/rf_accuracy_test.cu               |  4 +-
 cpp/test/sg/rf_batched_classification_test.cu |  4 +-
 cpp/test/sg/rf_batched_regression_test.cu     |  4 +-
 cpp/test/sg/rf_depth_test.cu                  |  6 +-
 cpp/test/sg/rf_test.cu                        |  6 +-
 cpp/test/sg/rf_treelite_test.cu               |  4 +-
 python/cuml/ensemble/randomforest_shared.pxd  |  6 +-
 .../cuml/ensemble/randomforestclassifier.pyx  |  7 +-
 .../cuml/ensemble/randomforestregressor.pyx   |  7 +-
 python/cuml/test/test_random_forest.py        |  6 +-
 29 files changed, 253 insertions(+), 163 deletions(-)

diff --git a/cpp/bench/sg/fil.cu b/cpp/bench/sg/fil.cu
index a5a9d7579c..7a80effc64 100644
--- a/cpp/bench/sg/fil.cu
+++ b/cpp/bench/sg/fil.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -143,12 +143,13 @@ std::vector<Params> getInputs() {
                  .shuffle = false,
                  .seed = 12345ULL};
 
-  set_rf_params(p.rf,  // Output RF parameters
-                1,     // n_trees, just a placeholder value, anyway changed below
-                true,  // bootstrap
-                1.f,   // max_samples
-                1234,  // seed
-                8);    // n_streams
+  set_rf_params(p.rf,     // Output RF parameters
+                1,        // n_trees, just a placeholder value,
+                          // anyway changed below
+                true,     // bootstrap
+                1.f,      // max_samples
+                1234ULL,  // seed
+                8);       // n_streams
 
   set_tree_params(p.rf.tree_params,  // Output tree parameters
                   10,  // max_depth, just a placeholder value,
diff --git a/cpp/bench/sg/rf_classifier.cu b/cpp/bench/sg/rf_classifier.cu
index 5e2f4f3fcf..29a9d0c371 100644
--- a/cpp/bench/sg/rf_classifier.cu
+++ b/cpp/bench/sg/rf_classifier.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -83,12 +83,12 @@ std::vector<Params> getInputs() {
                 10.0,         // center_box_max
                 2152953ULL};  //seed
 
-  set_rf_params(p.rf,  // Output RF parameters
-                500,   // n_trees
-                true,  // bootstrap
-                1.f,   // max_samples
-                1234,  // seed
-                8);    // n_streams
+  set_rf_params(p.rf,     // Output RF parameters
+                500,      // n_trees
+                true,     // bootstrap
+                1.f,      // max_samples
+                1234ULL,  // seed
+                8);       // n_streams
 
   set_tree_params(p.rf.tree_params,  // Output tree parameters
                   10,  // max_depth, this is anyway changed below
diff --git a/cpp/bench/sg/rf_regressor.cu b/cpp/bench/sg/rf_regressor.cu
index 59d01770ec..a6771ae567 100644
--- a/cpp/bench/sg/rf_regressor.cu
+++ b/cpp/bench/sg/rf_regressor.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -85,12 +85,12 @@ std::vector<RegParams> getInputs() {
                 .noise = 1.0,
                 .seed = 12345ULL};
 
-  set_rf_params(p.rf,  // Output RF parameters
-                500,   // n_trees
-                true,  // bootstrap
-                1.f,   // max_samples
-                1234,  // seed
-                8);    // n_streams
+  set_rf_params(p.rf,     // Output RF parameters
+                500,      // n_trees
+                true,     // bootstrap
+                1.f,      // max_samples
+                1234ULL,  // seed
+                8);       // n_streams
 
   set_tree_params(p.rf.tree_params,  // Output tree parameters
                   10,  // max_depth, just a place holder value,
diff --git a/cpp/include/cuml/ensemble/randomforest.hpp b/cpp/include/cuml/ensemble/randomforest.hpp
index 32a9447a63..525d6e07ea 100644
--- a/cpp/include/cuml/ensemble/randomforest.hpp
+++ b/cpp/include/cuml/ensemble/randomforest.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -77,7 +77,7 @@ struct RF_params {
   /**
    * random seed
    */
-  int seed;
+  uint64_t seed;
   /**
    * Number of concurrent GPU streams for parallel tree building.
    * Each stream is independently managed by CPU thread.
@@ -89,9 +89,10 @@ struct RF_params {
 
 void set_rf_params(RF_params& params, int cfg_n_trees = 1,
                    bool cfg_bootstrap = true, float cfg_max_samples = 1.0f,
-                   int cfg_seed = -1, int cfg_n_streams = 8);
+                   uint64_t cfg_seed = 0, int cfg_n_streams = 8);
 void set_all_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap,
-                       float cfg_max_samples, int cfg_seed, int cfg_n_streams,
+                       float cfg_max_samples, uint64_t cfg_seed,
+                       int cfg_n_streams,
                        DecisionTree::DecisionTreeParams cfg_tree_params);
 void validity_check(const RF_params rf_params);
 void print(const RF_params rf_params);
@@ -190,7 +191,7 @@ RF_params set_rf_class_obj(int max_depth, int max_leaves, float max_features,
                            int n_bins, int split_algo, int min_samples_leaf,
                            int min_samples_split, float min_impurity_decrease,
                            bool bootstrap_features, bool bootstrap, int n_trees,
-                           float max_samples, int seed,
+                           float max_samples, uint64_t seed,
                            CRITERION split_criterion, bool quantile_per_tree,
                            int cfg_n_streams, bool use_experimental_backend,
                            int max_batch_size);
diff --git a/cpp/include/cuml/tree/decisiontree.hpp b/cpp/include/cuml/tree/decisiontree.hpp
index 523536daf8..91b03a87d5 100644
--- a/cpp/include/cuml/tree/decisiontree.hpp
+++ b/cpp/include/cuml/tree/decisiontree.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -200,6 +200,7 @@ typedef TreeMetaDataNode<double, int> TreeClassifierD;
  * @param[in] n_unique_labels: number of unique label values. Number of
  *                             categories of classification.
  * @param[in] tree_params: Decision Tree training hyper parameter struct.
+ * @param[in] seed: Controls the randomness in tree fitting/growing algorithm.
  * @{
  */
 void decisionTreeClassifierFit(const raft::handle_t &handle,
                                const int ncols, const int nrows, int *labels,
                                unsigned int *rowids, const int n_sampled_rows,
                                int unique_labels,
-                               DecisionTree::DecisionTreeParams tree_params);
+                               DecisionTree::DecisionTreeParams tree_params,
+                               uint64_t seed);
 void decisionTreeClassifierFit(const raft::handle_t &handle,
                                TreeClassifierD *&tree, double *data,
                                const int ncols, const int nrows, int *labels,
                                unsigned int *rowids, const int n_sampled_rows,
                                int unique_labels,
-                               DecisionTree::DecisionTreeParams tree_params);
+                               DecisionTree::DecisionTreeParams tree_params,
+                               uint64_t seed);
 /** @} */
 
 /**
@@ -268,18 +271,21 @@ typedef TreeMetaDataNode<double, double> TreeRegressorD;
  * @param[in] n_sampled_rows: number of training samples, after sampling. If using decision
  *                            tree directly over the whole dataset: n_sampled_rows = nrows
  * @param[in] tree_params: Decision Tree training hyper parameter struct.
+ * @param[in] seed: Controls the randomness in tree fitting/growing algorithm.
  * @{
  */
 void decisionTreeRegressorFit(const raft::handle_t &handle,
                               TreeRegressorF *&tree, float *data,
                               const int ncols, const int nrows, float *labels,
                               unsigned int *rowids, const int n_sampled_rows,
-                              DecisionTree::DecisionTreeParams tree_params);
+                              DecisionTree::DecisionTreeParams tree_params,
+                              uint64_t seed);
 void decisionTreeRegressorFit(const raft::handle_t &handle,
                               TreeRegressorD *&tree, double *data,
                               const int ncols, const int nrows, double *labels,
                               unsigned int *rowids, const int n_sampled_rows,
-                              DecisionTree::DecisionTreeParams tree_params);
+                              DecisionTree::DecisionTreeParams tree_params,
+                              uint64_t seed);
 /** @} */
 
 /**
diff --git a/cpp/include/cuml/tree/flatnode.h b/cpp/include/cuml/tree/flatnode.h
index 329961567b..138182e550 100644
--- a/cpp/include/cuml/tree/flatnode.h
+++ b/cpp/include/cuml/tree/flatnode.h
@@ -31,6 +31,7 @@ struct SparseTreeNode {
   DataT quesval;
   DataT best_metric_val;
   IdxT left_child_id = IdxT(-1);
+  uint32_t unique_id = UINT32_MAX;
 };
 
 template <typename T, typename L>
diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
index 79d18022b1..5100b6038a 100644
--- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -42,17 +42,18 @@
 template <typename Traits, typename DataT, typename LabelT, typename IdxT>
 void grow_tree(std::shared_ptr<raft::mr::device::allocator> d_allocator,
                std::shared_ptr<raft::mr::host::allocator> h_allocator,
-               const DataT* data, IdxT ncols, IdxT nrows, const LabelT* labels,
-               const DataT* quantiles, IdxT* rowids, IdxT* colids,
-               int n_sampled_rows, int unique_labels,
+               const DataT* data, IdxT treeid, uint64_t seed, IdxT ncols,
+               IdxT nrows, const LabelT* labels, const DataT* quantiles,
+               IdxT* rowids, int n_sampled_rows, int unique_labels,
                const DecisionTreeParams& params, cudaStream_t stream,
                std::vector<SparseTreeNode<DataT, LabelT, IdxT>>& sparsetree,
                IdxT& num_leaves, IdxT& depth) {
   Builder<Traits> builder;
   size_t d_wsize, h_wsize;
-  builder.workspaceSize(d_wsize, h_wsize, params, data, labels, nrows, ncols,
-                        n_sampled_rows, IdxT(params.max_features * ncols),
-                        rowids, colids, unique_labels, quantiles);
+  builder.workspaceSize(d_wsize, h_wsize, treeid, seed, params, data, labels,
+                        nrows, ncols, n_sampled_rows,
+                        IdxT(params.max_features * ncols), rowids,
+                        unique_labels, quantiles);
   MLCommon::device_buffer<char> d_buff(d_allocator, stream, d_wsize);
   MLCommon::host_buffer<char> h_buff(h_allocator, stream, h_wsize);
 
@@ -100,29 +101,29 @@
 template <typename DataT, typename LabelT, typename IdxT>
 void grow_tree(std::shared_ptr<raft::mr::device::allocator> d_allocator,
                std::shared_ptr<raft::mr::host::allocator> h_allocator,
-               const DataT* data, IdxT ncols, IdxT nrows, const LabelT* labels,
-               const DataT* quantiles, IdxT* rowids, IdxT* colids,
-               int n_sampled_rows, int unique_labels,
+               const DataT* data, IdxT treeid, uint64_t seed, IdxT ncols,
+               IdxT nrows, const LabelT* labels, const DataT* quantiles,
+               IdxT* rowids, int n_sampled_rows, int unique_labels,
                const DecisionTreeParams& params, cudaStream_t stream,
                std::vector<SparseTreeNode<DataT, LabelT, IdxT>>& sparsetree,
                IdxT& num_leaves, IdxT& depth) {
   typedef ClsTraits<DataT, LabelT, IdxT> Traits;
-  grow_tree<Traits>(d_allocator, h_allocator, data, ncols, nrows, labels,
-                    quantiles, rowids, colids, n_sampled_rows, unique_labels,
+  grow_tree<Traits>(d_allocator, h_allocator, data, treeid, seed, ncols, nrows,
+                    labels, quantiles, rowids, n_sampled_rows, unique_labels,
                     params, stream, sparsetree, num_leaves, depth);
 }
 
 template <typename DataT, typename IdxT>
 void grow_tree(std::shared_ptr<raft::mr::device::allocator> d_allocator,
                std::shared_ptr<raft::mr::host::allocator> h_allocator,
-               const DataT* data, IdxT ncols, IdxT nrows, const DataT* labels,
-               const DataT* quantiles, IdxT* rowids, IdxT* colids,
-               int n_sampled_rows, int unique_labels,
+               const DataT* data, IdxT treeid, uint64_t seed, IdxT ncols,
+               IdxT nrows, const DataT* labels, const DataT* quantiles,
+               IdxT* rowids, int n_sampled_rows, int unique_labels,
                const DecisionTreeParams& params, cudaStream_t stream,
                std::vector<SparseTreeNode<DataT, DataT, IdxT>>& sparsetree,
                IdxT& num_leaves, IdxT& depth) {
   typedef RegTraits<DataT, IdxT> Traits;
-  grow_tree<Traits>(d_allocator, h_allocator, data, ncols, nrows, labels,
-                    quantiles, rowids, colids, n_sampled_rows, unique_labels,
+  grow_tree<Traits>(d_allocator, h_allocator, data, treeid, seed, ncols, nrows,
+                    labels, quantiles, rowids, n_sampled_rows, unique_labels,
                     params, stream, sparsetree, num_leaves, depth);
 }
 /** @} */
diff --git a/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh b/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh
index 030b08e8e8..79ac2ddda5 100644
--- a/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -60,6 +60,10 @@ struct Builder {
   /** size of block-sync workspace (regression + MAE only) */
   size_t block_sync_size;
 
+  /** Tree index */
+  IdxT treeid;
+  /** Seed used for randomization */
+  uint64_t seed;
   /** number of nodes created in the current batch */
   IdxT* n_nodes;
   /** class histograms (classification only) */
@@ -133,14 +137,16 @@ struct Builder {
    *                       be computed fresh. [on device] [col-major]
    *                       [dim = nbins x sampledCols]
    */
-  void workspaceSize(size_t& d_wsize, size_t& h_wsize,
-                     const DecisionTreeParams& p, const DataT* data,
-                     const LabelT* labels, IdxT totalRows, IdxT totalCols,
-                     IdxT sampledRows, IdxT sampledCols, IdxT* rowids,
-                     IdxT* colids, IdxT nclasses, const DataT* quantiles) {
+  void workspaceSize(size_t& d_wsize, size_t& h_wsize, IdxT treeid,
+                     uint64_t seed, const DecisionTreeParams& p,
+                     const DataT* data, const LabelT* labels, IdxT totalRows,
+                     IdxT totalCols, IdxT sampledRows, IdxT sampledCols,
+                     IdxT* rowids, IdxT nclasses, const DataT* quantiles) {
     ASSERT(quantiles != nullptr,
           "Currently quantiles need to be computed before this call!");
     params = p;
+    this->treeid = treeid;
+    this->seed = seed;
     n_blks_for_cols = std::min(sampledCols, n_blks_for_cols);
     input.data = data;
     input.labels = labels;
@@ -149,7 +155,6 @@ struct Builder {
     input.nSampledRows = sampledRows;
     input.nSampledCols = sampledCols;
     input.rowids = rowids;
-    input.colids = colids;
     input.nclasses = nclasses;
     input.quantiles = quantiles;
     auto max_batch = params.max_batch_size;
@@ -294,6 +299,7 @@ struct Builder {
     h_nodes[0].start = 0;
     h_nodes[0].count = input.nSampledRows;
     h_nodes[0].depth = 0;
+    h_nodes[0].info.unique_id = 0;
   }
 
   /** check whether any more nodes need to be processed or not */
@@ -323,6 +329,7 @@ struct Builder {
     // start fresh on the number of *new* nodes created in this batch
     CUDA_CHECK(cudaMemsetAsync(n_nodes, 0, sizeof(IdxT), s));
     initSplit(splits, batchSize, s);
+
     // get the current set of nodes to be worked upon
     raft::update_device(curr_nodes, h_nodes.data() + node_start, batchSize, s);
     // iterate through a batch of columns (to reduce the memory pressure) and
@@ -404,7 +411,7 @@ struct ClsTraits {
       b.hist, b.params.n_bins, b.params.max_depth, b.params.min_samples_split,
       b.params.min_samples_leaf, b.params.min_impurity_decrease,
       b.params.max_leaves, b.input, b.curr_nodes, col, b.done_count, b.mutex,
-      b.n_leaves, b.splits, splitType);
+      b.n_leaves, b.splits, splitType, b.treeid, b.seed);
   }
 
   /**
@@ -480,7 +487,7 @@ struct RegTraits {
       b.params.max_depth, b.params.min_samples_split,
       b.params.min_samples_leaf, b.params.min_impurity_decrease,
       b.params.max_leaves, b.input, b.curr_nodes, col, b.done_count, b.mutex,
-      b.n_leaves, b.splits, b.block_sync, splitType);
+      b.n_leaves, b.splits, b.block_sync, splitType, b.treeid, b.seed);
   }
 
   /**
diff --git a/cpp/src/decisiontree/batched-levelalgo/input.cuh b/cpp/src/decisiontree/batched-levelalgo/input.cuh
index 7e3a54c908..c87032ee9a 100644
--- a/cpp/src/decisiontree/batched-levelalgo/input.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/input.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -35,8 +35,6 @@ struct Input {
   IdxT nSampledCols;
   /** indices of sampled rows */
   IdxT* rowids;
-  /** indices of sampled cols */
-  IdxT* colids;
   /** number of classes (useful only in classification) */
   IdxT nclasses;
   /** quantiles/histogram computed on the dataset (col-major) */
diff --git a/cpp/src/decisiontree/batched-levelalgo/kernels.cuh b/cpp/src/decisiontree/batched-levelalgo/kernels.cuh
index 40ae3de461..1a35eb9f40 100644
--- a/cpp/src/decisiontree/batched-levelalgo/kernels.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/kernels.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -264,13 +264,78 @@ __device__ OutT* alignPointer(InT input) {
     raft::alignTo(reinterpret_cast<size_t>(input), sizeof(OutT)));
 }
 
+// 32-bit FNV1a hash
+// Reference: http://www.isthe.com/chongo/tech/comp/fnv/index.html
+const uint32_t fnv1a32_prime = uint32_t(16777619);
+const uint32_t fnv1a32_basis = uint32_t(2166136261);
+
+DI uint32_t fnv1a32(uint32_t hash, uint32_t txt) {
+  hash ^= (txt >> 0) & 0xFF;
+  hash *= fnv1a32_prime;
+  hash ^= (txt >> 8) & 0xFF;
+  hash *= fnv1a32_prime;
+  hash ^= (txt >> 16) & 0xFF;
+  hash *= fnv1a32_prime;
+  hash ^= (txt >> 24) & 0xFF;
+  hash *= fnv1a32_prime;
+  return hash;
+}
+
+/**
+ * @brief For given values of (treeid, nodeid, seed), this function generates
+ *        a unique permutation of [0, N - 1] values and returns the 'k'th
+ *        entry from the permutation.
+ * @return The 'k'th value from the permutation
+ * @note This function does not allocate any temporary buffer; all the
+ *       necessary values are recomputed.
+ */
+template <typename IdxT>
+DI IdxT select(IdxT k, IdxT treeid, uint32_t nodeid, uint64_t seed, IdxT N) {
+  __shared__ int blksum;
+  uint32_t pivot_hash;
+  int cnt = 0;
+
+  if (threadIdx.x == 0) {
+    blksum = 0;
+  }
+  // Compute hash for the 'k'th index and use it as the pivot for sorting
+  pivot_hash = fnv1a32_basis;
+  pivot_hash = fnv1a32(pivot_hash, uint32_t(k));
+  pivot_hash = fnv1a32(pivot_hash, uint32_t(treeid));
+  pivot_hash = fnv1a32(pivot_hash, uint32_t(nodeid));
+  pivot_hash = fnv1a32(pivot_hash, uint32_t(seed >> 32));
+  pivot_hash = fnv1a32(pivot_hash, uint32_t(seed));
+
+  // Compute hash for the rest of the indices and count instances where i_hash
+  // is less than pivot_hash
+  uint32_t i_hash;
+  for (int i = threadIdx.x; i < N; i += blockDim.x) {
+    if (i == k) continue;  // Skip since k is the pivot index
+    i_hash = fnv1a32_basis;
+    i_hash = fnv1a32(i_hash, uint32_t(i));
+    i_hash = fnv1a32(i_hash, uint32_t(treeid));
+    i_hash = fnv1a32(i_hash, uint32_t(nodeid));
+    i_hash = fnv1a32(i_hash, uint32_t(seed >> 32));
+    i_hash = fnv1a32(i_hash, uint32_t(seed));
+
+    if (i_hash < pivot_hash)
+      cnt++;
+    else if (i_hash == pivot_hash && i < k)
+      cnt++;
+  }
+  __syncthreads();
+  if (cnt > 0) atomicAdd(&blksum, cnt);
+  __syncthreads();
+  return blksum;
+}
+
 template <typename DataT, typename LabelT, typename IdxT, int TPB>
 __global__ void computeSplitClassificationKernel(
   int* hist, IdxT nbins, IdxT max_depth, IdxT min_samples_split,
   IdxT min_samples_leaf, DataT min_impurity_decrease, IdxT max_leaves,
   Input<DataT, LabelT, IdxT> input, const Node<DataT, LabelT, IdxT>* nodes,
   IdxT colStart, int* done_count, int* mutex, const IdxT* n_leaves,
-  Split<DataT, IdxT>* splits, CRITERION splitType) {
+  Split<DataT, IdxT>* splits, CRITERION splitType, IdxT treeid,
+  uint64_t seed) {
   extern __shared__ char smem[];
   IdxT nid = blockIdx.z;
   auto node = nodes[nid];
@@ -288,7 +353,15 @@ __global__ void computeSplitClassificationKernel(
   auto* sDone = alignPointer<int>(sbins + nbins);
   IdxT stride = blockDim.x * gridDim.x;
   IdxT tid = threadIdx.x + blockIdx.x * blockDim.x;
-  auto col = input.colids[colStart + blockIdx.y];
+
+  IdxT col;
+  if (input.nSampledCols == input.N) {
+    col = colStart + blockIdx.y;
+  } else {
+    int colIndex = colStart + blockIdx.y;
+    col = select(colIndex, treeid, node.info.unique_id, seed, input.N);
+  }
+
   for (IdxT i = threadIdx.x; i < len; i += blockDim.x) shist[i] = 0;
   for (IdxT b = threadIdx.x; b < nbins; b += blockDim.x)
     sbins[b] = input.quantiles[col * nbins + b];
@@ -343,7 +416,8 @@ __global__ void computeSplitRegressionKernel(
   DataT min_impurity_decrease, IdxT max_leaves,
   Input<DataT, DataT, IdxT> input, const Node<DataT, DataT, IdxT>* nodes,
   IdxT colStart, int* done_count, int* mutex, const IdxT* n_leaves,
-  Split<DataT, IdxT>* splits, void* workspace, CRITERION splitType) {
+  Split<DataT, IdxT>* splits, void* workspace, CRITERION splitType,
+  IdxT treeid, uint64_t seed) {
   extern __shared__ char smem[];
   IdxT nid = blockIdx.z;
   auto node = nodes[nid];
@@ -364,7 +438,13 @@ __global__ void computeSplitRegressionKernel(
   auto* sDone = alignPointer<int>(spredP + nbins);
   IdxT stride = blockDim.x * gridDim.x;
   IdxT tid = threadIdx.x + blockIdx.x * blockDim.x;
-  auto col = input.colids[colStart + blockIdx.y];
+  IdxT col;
+  if (input.nSampledCols == input.N) {
+    col = colStart + blockIdx.y;
+  } else {
+    int colIndex = colStart + blockIdx.y;
+    col = select(colIndex, treeid, node.info.unique_id, seed, input.N);
+  }
   for (IdxT i = threadIdx.x; i < len; i += blockDim.x) {
     spred[i] = DataT(0.0);
   }
diff --git a/cpp/src/decisiontree/batched-levelalgo/node.cuh b/cpp/src/decisiontree/batched-levelalgo/node.cuh
index 827ef914a2..68c4652066 100644
--- a/cpp/src/decisiontree/batched-levelalgo/node.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/node.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -104,12 +104,14 @@
     nodes[pos].depth = depth + 1;
     nodes[pos].start = start;
     nodes[pos].count = split.nLeft;
+    nodes[pos].info.unique_id = 2 * info.unique_id + 1;
     // right
     ++pos;
     nodes[pos].initSpNode();
     nodes[pos].depth = depth + 1;
     nodes[pos].start = start + split.nLeft;
     nodes[pos].count = count - split.nLeft;
+    nodes[pos].info.unique_id = 2 * info.unique_id + 2;
     // update depth
     auto val = atomicMax(n_depth, depth + 1);
     __threadfence();
diff --git a/cpp/src/decisiontree/batched-levelalgo/split.cuh b/cpp/src/decisiontree/batched-levelalgo/split.cuh
index e05091ded5..3417d540bd 100644
--- a/cpp/src/decisiontree/batched-levelalgo/split.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/split.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -148,7 +148,7 @@ void initSplit(Split<DataT, IdxT>* splits, IdxT len, cudaStream_t s) {
 template <typename DataT, typename IdxT, int TPB = 256>
 void printSplits(Split<DataT, IdxT>* splits, IdxT len, cudaStream_t s) {
   auto op = [] __device__(Split<DataT, IdxT> * ptr, IdxT idx) {
-    printf("quesval = %f, colid = %d, best_metric_val = %f, nLeft = %d\n",
+    printf("quesval = %e, colid = %d, best_metric_val = %e, nLeft = %d\n",
            ptr->quesval, ptr->colid, ptr->best_metric_val, ptr->nLeft);
   };
   raft::linalg::writeOnlyUnaryOp<Split<DataT, IdxT>, decltype(op), IdxT, TPB>(
diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu
index 7d6e900061..8f3ded17da 100644
--- a/cpp/src/decisiontree/decisiontree.cu
+++ b/cpp/src/decisiontree/decisiontree.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,12 +56,6 @@ void set_tree_params(DecisionTreeParams &params, int cfg_max_depth,
       "To use experimental backend set split_algo = 1 (GLOBAL_QUANTILE)");
     cfg_use_experimental_backend = false;
   }
-  if (cfg_max_features != 1.0) {
-    CUML_LOG_WARN(
-      "Experimental backend does not yet support feature sub-sampling");
-    CUML_LOG_WARN("To use experimental backend set max_features = 1.0");
-    cfg_use_experimental_backend = false;
-  }
   if (cfg_quantile_per_tree) {
     CUML_LOG_WARN(
       "Experimental backend does not yet support per tree quantile "
@@ -160,11 +154,12 @@ void decisionTreeClassifierFit(const raft::handle_t &handle,
                                const int ncols, const int nrows, int *labels,
                                unsigned int *rowids, const int n_sampled_rows,
                                int unique_labels,
-                               DecisionTree::DecisionTreeParams tree_params) {
+                               DecisionTree::DecisionTreeParams tree_params,
+                               uint64_t seed) {
   std::shared_ptr<DecisionTreeClassifier<float>> dt_classifier =
     std::make_shared<DecisionTreeClassifier<float>>();
   dt_classifier->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
-                     unique_labels, tree, tree_params);
+                     unique_labels, tree, tree_params, seed);
 }
 
 void decisionTreeClassifierFit(const raft::handle_t &handle,
                                TreeClassifierD *&tree, double *data,
                                const int ncols, const int nrows, int *labels,
                                unsigned int *rowids, const int n_sampled_rows,
                                int unique_labels,
-                               DecisionTree::DecisionTreeParams tree_params) {
+                               DecisionTree::DecisionTreeParams tree_params,
+                               uint64_t seed) {
   std::shared_ptr<DecisionTreeClassifier<double>> dt_classifier =
     std::make_shared<DecisionTreeClassifier<double>>();
   dt_classifier->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
-                     unique_labels, tree, tree_params);
+                     unique_labels, tree, tree_params, seed);
 }
 
 void decisionTreeClassifierPredict(const raft::handle_t &handle,
@@ -207,22 +203,24 @@ void decisionTreeRegressorFit(const raft::handle_t &handle,
                               TreeRegressorF *&tree, float *data,
                               const int ncols, const int nrows, float *labels,
                               unsigned int *rowids, const int n_sampled_rows,
-                              DecisionTree::DecisionTreeParams tree_params) {
+                              DecisionTree::DecisionTreeParams tree_params,
+                              uint64_t seed) {
   std::shared_ptr<DecisionTreeRegressor<float>> dt_regressor =
     std::make_shared<DecisionTreeRegressor<float>>();
   dt_regressor->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
-                    tree, tree_params);
+                    tree, tree_params, seed);
 }
 
 void decisionTreeRegressorFit(const raft::handle_t &handle,
                               TreeRegressorD *&tree, double *data,
                               const int ncols, const int nrows, double *labels,
                               unsigned int *rowids, const int n_sampled_rows,
-                              DecisionTree::DecisionTreeParams tree_params) {
+                              DecisionTree::DecisionTreeParams tree_params,
+                              uint64_t seed) {
   std::shared_ptr<DecisionTreeRegressor<double>> dt_regressor =
     std::make_shared<DecisionTreeRegressor<double>>();
   dt_regressor->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
-                    tree, tree_params);
+                    tree, tree_params, seed);
 }
 
 void decisionTreeRegressorPredict(const raft::handle_t &handle,
diff --git a/cpp/src/decisiontree/decisiontree_impl.cuh b/cpp/src/decisiontree/decisiontree_impl.cuh
index b1f70818a0..f742643471 100644
--- a/cpp/src/decisiontree/decisiontree_impl.cuh
+++ b/cpp/src/decisiontree/decisiontree_impl.cuh
@@ -51,10 +51,15 @@ namespace DecisionTree {
 template <typename T, typename L>
 void print(const SparseTreeNode<T, L> &node, std::ostream &os) {
   if (node.colid == -1) {
-    os << "(leaf, " << node.prediction << ", " << node.best_metric_val << ")";
+    os << "(leaf, "
+       << "prediction: " << node.prediction
+       << ", best_metric_val: " << node.best_metric_val
+       << ", UID: " << node.unique_id << ")";
   } else {
-    os << "(" << node.colid << ", " << node.quesval << ", "
-       << node.best_metric_val << ")";
+    os << "("
+       << "colid: " << node.colid << ", quesval: " << node.quesval
+       << ", best_metric_val: " << node.best_metric_val
+       << ", UID: " << node.unique_id << ")";
   }
   return;
 }
@@ -255,7 +260,8 @@ template <typename T, typename L>
 void DecisionTreeBase<T, L>::plant(
   std::vector<SparseTreeNode<T, L>> &sparsetree, const T *data, const int ncols,
   const int nrows, const L *labels, unsigned int *rowids,
-  const int n_sampled_rows, int unique_labels, const int treeid) {
+  const int n_sampled_rows, int unique_labels, const int treeid,
+  uint64_t seed) {
   dinfo.NLocalrows = nrows;
   dinfo.NGlobalrows = nrows;
   dinfo.Ncols = ncols;
@@ -288,11 +294,8 @@ void DecisionTreeBase<T, L>::plant(
       CUML_LOG_WARN("Using experimental backend for growing trees\n");
     }
     T *quantiles = tempmem->d_quantile->data();
-    int *colids = (int *)tempmem->device_allocator->allocate(
-      sizeof(int) * ncols, tempmem->stream);
-    MLCommon::iota(colids, 0, 1, ncols, tempmem->stream);
-    grow_tree(tempmem->device_allocator, tempmem->host_allocator, data, ncols,
-              nrows, labels, quantiles, (int *)rowids, (int *)colids,
+    grow_tree(tempmem->device_allocator, tempmem->host_allocator, data, treeid,
+              seed, ncols, nrows, labels, quantiles, (int *)rowids,
              n_sampled_rows, unique_labels, tree_params, tempmem->stream,
              sparsetree, this->leaf_counter, this->depth_counter);
   } else {
@@ -371,7 +374,7 @@ void DecisionTreeBase<T, L>::base_fit(
   const cudaStream_t stream_in, const T *data, const int ncols,
   const int nrows, const L *labels, unsigned int *rowids,
   const int n_sampled_rows, int unique_labels,
   std::vector<SparseTreeNode<T, L>> &sparsetree,
-  const int treeid, bool is_classifier,
+  const int treeid, uint64_t seed, bool is_classifier,
   std::shared_ptr<TemporaryMemory<T, L>> in_tempmem) {
   prepare_fit_timer.reset();
   const char *CRITERION_NAME[] = {"GINI", "ENTROPY", "MSE", "MAE", "END"};
@@ -408,7 +411,7 @@ void DecisionTreeBase<T, L>::base_fit(
   }
 
   plant(sparsetree, data, ncols, nrows, labels, rowids, n_sampled_rows,
-        unique_labels, treeid);
+        unique_labels, treeid, seed);
   if (in_tempmem == nullptr) {
     tempmem.reset();
   }
@@ -419,13 +422,13 @@ void DecisionTreeClassifier<T>::fit(
   const raft::handle_t &handle, const T *data, const int ncols,
   const int nrows, const int *labels, unsigned int *rowids,
   const int n_sampled_rows, const int unique_labels,
   TreeMetaDataNode<T, int> *&tree,
-  DecisionTreeParams tree_parameters,
+  DecisionTreeParams tree_parameters, uint64_t seed,
   std::shared_ptr<TemporaryMemory<T, int>> in_tempmem) {
   this->tree_params = tree_parameters;
   this->base_fit(handle.get_device_allocator(), handle.get_host_allocator(),
                  handle.get_stream(), data, ncols, nrows, labels, rowids,
                  n_sampled_rows, unique_labels, tree->sparsetree, tree->treeid,
-                 true, in_tempmem);
+                 seed, true, in_tempmem);
   this->set_metadata(tree);
 }
@@ -138,6 +139,7 @@ class DecisionTreeClassifier : public DecisionTreeBase<T, int> {
           const int nrows, const int *labels, unsigned int *rowids,
           const int n_sampled_rows, const int unique_labels,
           TreeMetaDataNode<T, int> *&tree, DecisionTreeParams tree_parameters,
+          uint64_t seed,
           std::shared_ptr<TemporaryMemory<T, int>> in_tempmem = nullptr);
 
   //This fit fucntion does not take handle , used by RF
@@ -147,7 +149,7 @@ class DecisionTreeClassifier : public DecisionTreeBase<T, int> {
           const int nrows, const int *labels, unsigned int *rowids,
           const int n_sampled_rows, const int unique_labels,
           TreeMetaDataNode<T, int> *&tree, DecisionTreeParams tree_parameters,
-          std::shared_ptr<TemporaryMemory<T, int>> in_tempmem);
+          uint64_t seed, std::shared_ptr<TemporaryMemory<T, int>> in_tempmem);
 
  private:
   void grow_deep_tree(const T *data, const int *labels, unsigned int *rowids,
@@ -165,7 +167,7 @@ class DecisionTreeRegressor : public DecisionTreeBase<T, T> {
   void fit(const raft::handle_t &handle, const T *data, const int ncols,
            const int nrows, const T *labels, unsigned int *rowids,
            const int n_sampled_rows, TreeMetaDataNode<T, T> *&tree,
-           DecisionTreeParams tree_parameters,
+           DecisionTreeParams tree_parameters, uint64_t seed,
           std::shared_ptr<TemporaryMemory<T, T>> in_tempmem = nullptr);
 
   //This fit function does not take handle. Used by RF
@@ -174,7 +176,7 @@ class DecisionTreeRegressor : public DecisionTreeBase<T, T> {
            const cudaStream_t stream_in, const T *data, const int ncols,
            const int nrows, const T *labels, unsigned int *rowids,
            const int n_sampled_rows, TreeMetaDataNode<T, T> *&tree,
-           DecisionTreeParams tree_parameters,
+           DecisionTreeParams tree_parameters, uint64_t seed,
           std::shared_ptr<TemporaryMemory<T, T>> in_tempmem);
 
  private:
diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu
index 52c4b4153c..4438bbfa60 100644
--- a/cpp/src/randomforest/randomforest.cu
+++ b/cpp/src/randomforest/randomforest.cu
@@ -162,7 +162,8 @@ void postprocess_labels(int n_rows, std::vector<int>& labels,
 * @param[in] cfg_n_streams: No of parallel CUDA for training forest
 */
 void set_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap,
-                   float cfg_max_samples, int cfg_seed, int cfg_n_streams) {
+                   float cfg_max_samples, uint64_t cfg_seed,
+                   int cfg_n_streams) {
   params.n_trees = cfg_n_trees;
   params.bootstrap = cfg_bootstrap;
   params.max_samples = cfg_max_samples;
@@ -186,7 +187,8 @@ void set_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap,
 * @param[in] cfg_tree_params: tree parameters
 */
 void set_all_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap,
-                       float cfg_max_samples, int cfg_seed, int cfg_n_streams,
+                       float cfg_max_samples, uint64_t cfg_seed,
+                       int cfg_n_streams,
                        DecisionTree::DecisionTreeParams cfg_tree_params) {
   params.n_trees = cfg_n_trees;
   params.bootstrap = cfg_bootstrap;
@@ -651,7 +653,7 @@ RF_params set_rf_class_obj(int max_depth, int max_leaves, float max_features,
                            int n_bins, int split_algo, int min_samples_leaf,
                            int min_samples_split, float min_impurity_decrease,
                            bool bootstrap_features, bool bootstrap, int n_trees,
-                           float max_samples, int seed,
+                           float max_samples, uint64_t seed,
                            CRITERION split_criterion, bool quantile_per_tree,
                            int cfg_n_streams, bool use_experimental_backend,
                            int max_batch_size) {
diff --git a/cpp/src/randomforest/randomforest_impl.cuh b/cpp/src/randomforest/randomforest_impl.cuh
index e86719935b..04a0634749 100644
--- a/cpp/src/randomforest/randomforest_impl.cuh
+++ b/cpp/src/randomforest/randomforest_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -69,7 +69,7 @@ void rf<T, L>::prepare_fit_per_tree(
   const int num_sms, const cudaStream_t stream,
   const std::shared_ptr<deviceAllocator> device_allocator) {
   int rs = tree_id;
-  if (rf_params.seed > -1) rs = rf_params.seed + tree_id;
+  if (rf_params.seed != 0) rs = rf_params.seed + tree_id;
 
   raft::random::Rng rng(rs * 1000 | 0xFF00AA,
                         raft::random::GeneratorType::GenKiss99);
@@ -230,7 +230,8 @@ void rfClassifier<T>::fit(const raft::handle_t& user_handle, const T* input,
       trees[i].fit(handle.get_device_allocator(), handle.get_host_allocator(),
                    tempmem[stream_id]->stream, input, n_cols, n_rows, labels,
                    rowids, n_sampled_rows, n_unique_labels, tree_ptr,
-                   this->rf_params.tree_params, tempmem[stream_id]);
+                   this->rf_params.tree_params, this->rf_params.seed,
+                   tempmem[stream_id]);
   }
   //Cleanup
   for (int i = 0; i < n_streams; i++) {
@@ -506,7 +507,7 @@ void rfRegressor<T>::fit(const raft::handle_t& user_handle, const T* input,
       trees[i].fit(handle.get_device_allocator(), handle.get_host_allocator(),
                    tempmem[stream_id]->stream, input, n_cols, n_rows, labels,
                    rowids, n_sampled_rows, tree_ptr, this->rf_params.tree_params,
-                   tempmem[stream_id]);
+                   this->rf_params.seed, tempmem[stream_id]);
   }
   //Cleanup
   for (int i = 0; i < n_streams; i++) {
diff --git a/cpp/test/sg/decisiontree_batchedlevel_algo.cu b/cpp/test/sg/decisiontree_batchedlevel_algo.cu
index 3528c1213d..7a8681a25e 100644
--- a/cpp/test/sg/decisiontree_batchedlevel_algo.cu
+++ b/cpp/test/sg/decisiontree_batchedlevel_algo.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -67,8 +67,6 @@ class DtBaseTest : public ::testing::TestWithParam<DtTestParams> {
     allocator->deallocate(tmp, sizeof(T) * inparams.M * inparams.N, stream);
     rowids = (I*)allocator->allocate(sizeof(I) * inparams.M, stream);
     MLCommon::iota(rowids, 0, 1, inparams.M, stream);
-    colids = (I*)allocator->allocate(sizeof(I) * inparams.N, stream);
-    MLCommon::iota(colids, 0, 1, inparams.N, stream);
     quantiles =
       (T*)allocator->allocate(sizeof(T) * inparams.nbins * inparams.N, stream);
@@ -86,7 +84,6 @@ class DtBaseTest : public ::testing::TestWithParam<DtTestParams> {
     allocator->deallocate(data, sizeof(T) * inparams.M * inparams.N, stream);
     allocator->deallocate(labels, sizeof(L) * inparams.M, stream);
     allocator->deallocate(rowids, sizeof(int) * inparams.M, stream);
-    allocator->deallocate(colids, sizeof(int) * inparams.N, stream);
     allocator->deallocate(quantiles, sizeof(T) * inparams.nbins * inparams.N,
                           stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -98,7 +95,7 @@ class DtBaseTest : public ::testing::TestWithParam<DtTestParams> {
   std::shared_ptr<raft::handle_t> handle;
   T *data, *quantiles;
   L* labels;
-  I *rowids, *colids;
+  I* rowids;
   DecisionTreeParams params;
   DtTestParams inparams;
   std::vector<SparseTreeNode<T, L>> sparsetree;
@@ -129,8 +126,8 @@
 typedef DtClassifierTest DtClsTestF;
 TEST_P(DtClsTestF, Test) {
   int num_leaves, depth;
   grow_tree(
-    handle->get_device_allocator(), handle->get_host_allocator(), data,
-    inparams.N, inparams.M, labels, quantiles, rowids, colids, inparams.M,
+    handle->get_device_allocator(), handle->get_host_allocator(), data, 1, 0,
+    inparams.N, inparams.M, labels, quantiles, rowids, inparams.M,
     inparams.nclasses, params, stream, sparsetree, num_leaves, depth);
   // this is a "well behaved" dataset!
   ASSERT_EQ(depth, 1);
@@ -162,10 +159,10 @@
 typedef DtRegressorTest DtRegTestF;
 ///@todo: add checks
 TEST_P(DtRegTestF, Test) {
   int num_leaves, depth;
-  grow_tree(
-    handle->get_device_allocator(), handle->get_host_allocator(), data,
-    inparams.N, inparams.M, labels, quantiles, rowids, colids, inparams.M, 0,
-    params, stream, sparsetree, num_leaves, depth);
+  grow_tree(handle->get_device_allocator(),
+            handle->get_host_allocator(), data, 1, 0, inparams.N,
+            inparams.M, labels, quantiles, rowids, inparams.M, 0,
+            params, stream, sparsetree, num_leaves, depth);
   // goes all the way to max-depth
   ASSERT_EQ(depth, inparams.max_depth);
 }
diff --git a/cpp/test/sg/decisiontree_batchedlevel_unittest.cu b/cpp/test/sg/decisiontree_batchedlevel_unittest.cu
index c9f47cfee4..3bd5e0a58a 100644
--- a/cpp/test/sg/decisiontree_batchedlevel_unittest.cu
+++ b/cpp/test/sg/decisiontree_batchedlevel_unittest.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -77,8 +77,6 @@ class BatchedLevelAlgoUnitTestFixture {
       static_cast<LabelT*>(d_allocator->allocate(sizeof(LabelT) * n_row, 0));
     row_ids =
       static_cast<IdxT*>(d_allocator->allocate(sizeof(IdxT) * n_row, 0));
-    col_ids =
-      static_cast<IdxT*>(d_allocator->allocate(sizeof(IdxT) * n_col, 0));
 
     // Nodes that exist prior to the invocation of nodeSplitKernel()
     curr_nodes =
@@ -99,7 +97,6 @@ class BatchedLevelAlgoUnitTestFixture {
     raft::update_device(data, h_data.data(), n_row * n_col, 0);
     raft::update_device(labels, h_labels.data(), n_row, 0);
     MLCommon::iota(row_ids, 0, 1, n_row, 0);
-    MLCommon::iota(col_ids, 0, 1, n_col, 0);
 
     tempmem = std::make_shared<TemporaryMemory<DataT, LabelT>>(
       *raft_handle, cudaStream_t(0), n_row, n_col, 0, params);
@@ -117,7 +114,6 @@ class BatchedLevelAlgoUnitTestFixture {
     input.nSampledRows = n_row;
     input.nSampledCols = n_col;
     input.rowids = row_ids;
-    input.colids = col_ids;
     input.nclasses = 0;  // not applicable for regression
     input.quantiles = quantiles;
   }
@@ -127,7 +123,6 @@ class BatchedLevelAlgoUnitTestFixture {
     d_allocator->deallocate(data, sizeof(DataT) * n_row * n_col, 0);
     d_allocator->deallocate(labels, sizeof(LabelT) * n_row, 0);
     d_allocator->deallocate(row_ids, sizeof(IdxT) * n_row, 0);
-    d_allocator->deallocate(col_ids, sizeof(IdxT) * n_col, 0);
     d_allocator->deallocate(curr_nodes, sizeof(NodeT) * max_batch, 0);
     d_allocator->deallocate(new_nodes, sizeof(NodeT) * 2 * max_batch, 0);
     d_allocator->deallocate(n_new_nodes, sizeof(IdxT), 0);
@@ -157,7 +152,6 @@ class BatchedLevelAlgoUnitTestFixture {
   DataT* data;
   DataT* labels;
   IdxT* row_ids;
-  IdxT* col_ids;
 };
 
 class TestQuantiles : public ::testing::TestWithParam,
@@ -319,7 +313,9 @@ TEST_P(TestMetric, RegressionMetricGain) {
     pred, pred2, pred2P, pred_count, n_bins, params.max_depth,
     params.min_samples_split, params.min_samples_leaf,
     params.min_impurity_decrease, params.max_leaves, input, curr_nodes, 0,
-    done_count, mutex, n_new_leaves, splits, block_sync, split_criterion);
+    done_count, mutex, n_new_leaves, splits, block_sync, split_criterion, 0,
+    1234ULL);
+  raft::update_host(h_splits.data(), splits, 1, 0);
 
   CUDA_CHECK(cudaGetLastError());
   CUDA_CHECK(cudaStreamSynchronize(0));
diff --git a/cpp/test/sg/rf_accuracy_test.cu b/cpp/test/sg/rf_accuracy_test.cu
index f3ee0b2487..7ab48d60b6 100644
--- a/cpp/test/sg/rf_accuracy_test.cu
+++ b/cpp/test/sg/rf_accuracy_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -99,7 +99,7 @@ class RFClassifierAccuracyTest : public ::testing::TestWithParam {
     set_all_rf_params(rfp,
                       1,    /* n_trees */
                       true, /* bootstrap */
                       1.0,  /* max_samples */
-                      -1,   /* seed */
+                      0,    /* seed */
                       1,    /* n_streams */
                       tree_params);
   }
diff --git a/cpp/test/sg/rf_batched_classification_test.cu b/cpp/test/sg/rf_batched_classification_test.cu
index b39bebab5f..4677d5732d 100644
--- a/cpp/test/sg/rf_batched_classification_test.cu
+++ b/cpp/test/sg/rf_batched_classification_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -60,7 +60,7 @@ class RFBatchedClsTest : public ::testing::TestWithParam<RfInputs> {
                     params.split_criterion, false, true);
     RF_params rf_params;
     set_all_rf_params(rf_params, params.n_trees, params.bootstrap,
-                      params.max_samples, -1, params.n_streams, tree_params);
+                      params.max_samples, 0, params.n_streams, tree_params);
 
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.reset(new raft::handle_t(rf_params.n_streams));
diff --git a/cpp/test/sg/rf_batched_regression_test.cu b/cpp/test/sg/rf_batched_regression_test.cu
index 9c66586621..972b610cdd 100644
--- a/cpp/test/sg/rf_batched_regression_test.cu
+++ b/cpp/test/sg/rf_batched_regression_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -62,7 +62,7 @@ class RFBatchedRegTest : public ::testing::TestWithParam<RfInputs> {
                     params.split_criterion, false, true);
     RF_params rf_params;
     set_all_rf_params(rf_params, params.n_trees, params.bootstrap,
-                      params.max_samples, -1, params.n_streams, tree_params);
+                      params.max_samples, 0, params.n_streams, tree_params);
 
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.reset(new raft::handle_t(rf_params.n_streams));
diff --git a/cpp/test/sg/rf_depth_test.cu b/cpp/test/sg/rf_depth_test.cu
index fa69d393e3..d4204343fe 100644
--- a/cpp/test/sg/rf_depth_test.cu
+++ b/cpp/test/sg/rf_depth_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -75,7 +75,7 @@ class RfClassifierDepthTest : public ::testing::TestWithParam<int> {
                     params.split_criterion, false);
     RF_params rf_params;
     set_all_rf_params(rf_params, params.n_trees, params.bootstrap,
-                      params.max_samples, -1, params.n_streams, tree_params);
+                      params.max_samples, 0, params.n_streams, tree_params);
 
     int data_len = params.n_rows * params.n_cols;
     raft::allocate(data, data_len);
@@ -169,7 +169,7 @@ class RfRegressorDepthTest : public ::testing::TestWithParam<int> {
                     params.split_criterion, false);
     RF_params rf_params;
     set_all_rf_params(rf_params, params.n_trees, params.bootstrap,
-                      params.max_samples, -1, params.n_streams, tree_params);
+                      params.max_samples, 0, params.n_streams, tree_params);
 
     int data_len = params.n_rows * params.n_cols;
     raft::allocate(data, data_len);
diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index 71a69b72e8..b4451cdd05 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -64,7 +64,7 @@ class RfClassifierTest : public ::testing::TestWithParam<RfInputs<T>> {
                     params.split_criterion, false);
     RF_params rf_params;
     set_all_rf_params(rf_params, params.n_trees, params.bootstrap,
-                      params.max_samples, -1, params.n_streams, tree_params);
+                      params.max_samples, 0, params.n_streams, tree_params);
     //print(rf_params);
 
     //--------------------------------------------------------
@@ -167,7 +167,7 @@ class RfRegressorTest : public ::testing::TestWithParam<RfInputs<T>> {
                     params.split_criterion, false);
     RF_params rf_params;
     set_all_rf_params(rf_params, params.n_trees, params.bootstrap,
-                      params.max_samples, -1, params.n_streams, tree_params);
+                      params.max_samples, 0, params.n_streams, tree_params);
     //print(rf_params);
 
     //--------------------------------------------------------
diff --git a/cpp/test/sg/rf_treelite_test.cu b/cpp/test/sg/rf_treelite_test.cu
index b0dded3e88..31a6f4f9d0 100644
--- a/cpp/test/sg/rf_treelite_test.cu
+++ b/cpp/test/sg/rf_treelite_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -190,7 +190,7 @@ class RfTreeliteTestCommon : public ::testing::TestWithParam<RfInputs<T>> {
                     params.min_impurity_decrease, params.bootstrap_features,
                     params.split_criterion, false);
     set_all_rf_params(rf_params, params.n_trees, params.bootstrap,
-                      params.max_samples, -1, params.n_streams, tree_params);
+                      params.max_samples, 0, params.n_streams, tree_params);
 
     handle.reset(new raft::handle_t(rf_params.n_streams));
     data_len = params.n_rows * params.n_cols;
diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd
index 227ab56f41..c2b45cd8b4 100644
--- a/python/cuml/ensemble/randomforest_shared.pxd
+++ b/python/cuml/ensemble/randomforest_shared.pxd
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@ import numpy as np import warnings from libcpp cimport bool -from libc.stdint cimport uintptr_t +from libc.stdint cimport uintptr_t, uint64_t from libc.stdlib cimport calloc, malloc, free from libcpp.vector cimport vector from libcpp.string cimport string @@ -115,7 +115,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": bool, int, float, - int, + uint64_t, CRITERION, bool, int, diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 457b0ec1e6..7945068ef3 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -1,6 +1,6 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -42,7 +42,7 @@ from cython.operator cimport dereference as deref from libcpp cimport bool from libcpp.vector cimport vector -from libc.stdint cimport uintptr_t +from libc.stdint cimport uintptr_t, uint64_t from libc.stdlib cimport calloc, malloc, free from numba import cuda @@ -230,7 +230,6 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin): If set to true and following conditions are also met, experimental decision tree training implementation would be used: split_algo = 1 (GLOBAL_QUANTILE) - max_features = 1.0 (Feature sub-sampling disabled) quantile_per_tree = false (No per tree quantile computation) max_batch_size: int (default = 128) Maximum number of nodes that can be processed in a given batch. This is @@ -473,7 +472,7 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin): self.bootstrap, self.n_estimators, self.max_samples, - seed_val, + seed_val, self.split_criterion, self.quantile_per_tree, self.n_streams, diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 8e4301907b..0b63a1c46d 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -42,7 +42,7 @@ from cython.operator cimport dereference as deref from libcpp cimport bool from libcpp.vector cimport vector -from libc.stdint cimport uintptr_t +from libc.stdint cimport uintptr_t, uint64_t from libc.stdlib cimport calloc, malloc, free from numba import cuda @@ -222,7 +222,6 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin): If set to true and following conditions are also met, experimental decision tree training implementation would be used: split_algo = 1 (GLOBAL_QUANTILE) - max_features = 1.0 (Feature sub-sampling disabled) quantile_per_tree = false (No per tree quantile computation) max_batch_size: int (default = 128) Maximum number of nodes that can be processed in a given batch. 
This is @@ -453,7 +452,7 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin):
                 self.bootstrap,
                 self.n_estimators,
                 self.max_samples,
-                seed_val,
+                seed_val,
                 self.split_criterion,
                 self.quantile_per_tree,
                 self.n_streams,
diff --git a/python/cuml/test/test_random_forest.py b/python/cuml/test/test_random_forest.py
index 119cf1c5de..f03d0b7d5c 100644
--- a/python/cuml/test/test_random_forest.py
+++ b/python/cuml/test/test_random_forest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -173,10 +173,6 @@ def test_rf_classification(small_clf, datatype, split_algo,
         captured_stdout = f.getvalue()
     if use_experimental_backend:
         is_fallback_used = False
-        if max_features != 1.0:
-            assert ('Experimental backend does not yet support feature ' +
-                    'sub-sampling' in captured_stdout)
-            is_fallback_used = True
         if split_algo != 1:
             assert ('Experimental backend does not yet support histogram ' +
                     'split algorithm' in captured_stdout)

From 28953ab8144dfd0d74f5c90540248a9e2705523c Mon Sep 17 00:00:00 2001
From: Michael Demoret <42954918+mdemoret-nv@users.noreply.github.com>
Date: Tue, 2 Feb 2021 12:17:54 -0700
Subject: [PATCH 04/29] Improve Python Docs with Default Role (#3445)

This PR sets the default type of "interpolated text" in sphinx to `:py:obj:`. This is very useful for us since we frequently use a single backtick in our python documentation to refer to another python object.

Currently, the docstring:
```
`cuml.datasets.make_blobs`
```
Would generate (italicized, variable spaced):
![image](https://user-images.githubusercontent.com/42954918/106529509-dd4c1900-64a7-11eb-9977-49a2594c7c3e.png)

This PR changes it to (bold, mono spaced):
![image](https://user-images.githubusercontent.com/42954918/106529282-77f82800-64a7-11eb-86e2-a38e37ccea0d.png)

The added benefit here is that if the interpolated text is found in the index, it will link to that section. So in the above example, clicking on `cuml.datasets.make_blobs` will take you to the function documentation.

Finally, this PR adds a new type of interpolated text role: `:py:`. This should be used for inline python code. For example, the following code:
```
* `cuml.datasets.make_blobs` for references to objects (functions, classes, modules, etc.)
* :py:`import cupy as cp` for inline python code
* ``import cupy as cp`` for literal code
```
will generate:
![image](https://user-images.githubusercontent.com/42954918/106530276-3f594e00-64a9-11eb-8edf-569fc9dd829e.png)

I also looked for a few examples to replace to help seed usage of these new options. Updating every location would be very time-consuming and is best done over time.
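
To make the distinction between the three roles concrete, here is a small, hypothetical docstring sketch (the function and wording are invented purely for illustration; only the role usage is the point):

```python
def make_example(n_samples=100):
    """Generate an example dataset (illustrative only).

    With ``default_role = "py:obj"``, a single-backtick reference like
    `cuml.datasets.make_blobs` renders mono-spaced and cross-links to the
    documented object, :py:`import cupy as cp` renders as highlighted
    inline Python code, and ``import cupy as cp`` stays a plain literal.
    """
```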
Authors: - Michael Demoret (@mdemoret-nv) Approvers: - Dante Gama Dessavre (@dantegd) URL: https://github.com/rapidsai/cuml/pull/3445 --- docs/source/api.rst | 5 +++ docs/source/conf.py | 12 +++++-- python/cuml/common/input_utils.py | 6 ++-- python/cuml/dask/datasets/classification.py | 16 ++++----- python/cuml/datasets/blobs.py | 10 +++--- python/cuml/datasets/classification.py | 34 +++++++++--------- python/cuml/decomposition/incremental_pca.py | 36 ++++++++++---------- python/cuml/svm/svc.pyx | 2 +- 8 files changed, 67 insertions(+), 54 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 41f92d8a0a..c2dc582139 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -2,6 +2,11 @@ cuML API Reference ~~~~~~~~~~~~~~~~~~~ +.. role:: py(code) + :language: python + :class: highlight + + Module Configuration ==================== diff --git a/docs/source/conf.py b/docs/source/conf.py index 8bc89fa0de..296b4b6f36 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # # This file is execfile()d with the current directory set to its # containing dir. @@ -182,7 +182,10 @@ ] # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'https://docs.python.org/': None} +intersphinx_mapping = { + "python": ('https://docs.python.org/', None), + "scipy": ('https://docs.scipy.org/doc/scipy/reference', None) +} # Config numpydoc numpydoc_show_inherited_class_members = False @@ -201,3 +204,8 @@ def setup(app): 'cuml', 'https://github.com/rapidsai/' 'cuml/blob/{revision}/python/' '{package}/{path}#L{lineno}') + +# Set the default role for interpreted code (anything surrounded in `single +# backticks`) to be a python object. See +# https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-default_role +default_role = "py:obj" \ No newline at end of file diff --git a/python/cuml/common/input_utils.py b/python/cuml/common/input_utils.py index b2549f591f..673a45232d 100644 --- a/python/cuml/common/input_utils.py +++ b/python/cuml/common/input_utils.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -85,8 +85,8 @@ def get_supported_input_type(X): ----- To closely match the functionality of :func:`~cuml.common.input_utils.input_to_cuml_array`, this method will - return ``cupy.ndarray`` for any object supporting - `__cuda_array_interface__` and ``numpy.ndarray`` for any object supporting + return `cupy.ndarray` for any object supporting + `__cuda_array_interface__` and `numpy.ndarray` for any object supporting `__array_interface__`. Returns diff --git a/python/cuml/dask/datasets/classification.py b/python/cuml/dask/datasets/classification.py index 0c06bf79eb..b7978b423d 100644 --- a/python/cuml/dask/datasets/classification.py +++ b/python/cuml/dask/datasets/classification.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -46,17 +46,17 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, This initially creates clusters of points normally distributed (std=1) about vertices of an `n_informative`-dimensional hypercube with sides of - length ``2 * class_sep`` and assigns an equal number of clusters to each + length :py:`2 * class_sep` and assigns an equal number of clusters to each class. It introduces interdependence between these features and adds various types of further noise to the data. - Without shuffling, ``X`` horizontally stacks features in the following + Without shuffling, `X` horizontally stacks features in the following order: the primary `n_informative` features, followed by `n_redundant` linear combinations of the informative features, followed by `n_repeated` duplicates, drawn randomly with replacement from the informative and redundant features. The remaining features are filled with random noise. Thus, without shuffling, all useful features are contained in the columns - ``X[:, :n_informative + n_redundant + n_repeated]``. + :py:`X[:, :n_informative + n_redundant + n_repeated]`. Examples -------- @@ -104,7 +104,7 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, The total number of features. These comprise `n_informative` informative features, `n_redundant` redundant features, `n_repeated` duplicated features and - ``n_features-n_informative-n_redundant-n_repeated`` useless features + :py:`n_features-n_informative-n_redundant-n_repeated` useless features drawn at random. n_informative : int, optional (default=2) The number of informative features. Each class is composed of a number @@ -124,10 +124,10 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, The number of classes (or labels) of the classification problem. n_clusters_per_class : int, optional (default=2) The number of clusters per class. - weights : array-like of shape ``(n_classes,)`` or ``(n_classes - 1,)``, \ - (default=None) + weights : array-like of shape :py:`(n_classes,)` or :py:`(n_classes - 1,)`\ + , (default=None) The proportions of samples assigned to each class. If None, then - classes are balanced. Note that if ``len(weights) == n_classes - 1``, + classes are balanced. Note that if :py:`len(weights) == n_classes - 1`, then the last class weight is automatically inferred. More than `n_samples` samples may be returned if the sum of `weights` exceeds 1. diff --git a/python/cuml/datasets/blobs.py b/python/cuml/datasets/blobs.py index ca3ebf415f..cbb6450e00 100644 --- a/python/cuml/datasets/blobs.py +++ b/python/cuml/datasets/blobs.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -80,12 +80,12 @@ def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, the number of samples per cluster. n_features : int, optional (default=2) The number of features for each sample. - centers : int or array of shape [n_centers, n_features], optional + centers : int or array of shape [`n_centers`, `n_features`], optional (default=None) The number of centers to generate, or the fixed center locations. - If n_samples is an int and centers is None, 3 centers are generated. - If n_samples is array-like, centers must be - either None or an array of length equal to the length of n_samples. + If `n_samples` is an int and centers is None, 3 centers are generated. 
+ If `n_samples` is array-like, centers must be + either None or an array of length equal to the length of `n_samples`. cluster_std : float or sequence of floats, optional (default=1.0) The standard deviation of the clusters. center_box : pair of floats (min, max), optional (default=(-10.0, 10.0)) diff --git a/python/cuml/datasets/classification.py b/python/cuml/datasets/classification.py index 236d4dc526..c6e927228d 100644 --- a/python/cuml/datasets/classification.py +++ b/python/cuml/datasets/classification.py @@ -54,17 +54,17 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, """ Generate a random n-class classification problem. This initially creates clusters of points normally distributed (std=1) - about vertices of an ``n_informative``-dimensional hypercube with sides of - length ``2*class_sep`` and assigns an equal number of clusters to each + about vertices of an `n_informative`-dimensional hypercube with sides of + length :py:`2*class_sep` and assigns an equal number of clusters to each class. It introduces interdependence between these features and adds various types of further noise to the data. - Without shuffling, ``X`` horizontally stacks features in the following - order: the primary ``n_informative`` features, followed by ``n_redundant`` - linear combinations of the informative features, followed by ``n_repeated`` + Without shuffling, `X` horizontally stacks features in the following + order: the primary `n_informative` features, followed by `n_redundant` + linear combinations of the informative features, followed by `n_repeated` duplicates, drawn randomly with replacement from the informative and redundant features. The remaining features are filled with random noise. Thus, without shuffling, all useful features are contained in the columns - ``X[:, :n_informative + n_redundant + n_repeated]``. + :py:`X[:, :n_informative + n_redundant + n_repeated]`. Examples -------- @@ -106,15 +106,15 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, n_samples : int, optional (default=100) The number of samples. n_features : int, optional (default=20) - The total number of features. These comprise ``n_informative`` - informative features, ``n_redundant`` redundant features, - ``n_repeated`` duplicated features and - ``n_features-n_informative-n_redundant-n_repeated`` useless features + The total number of features. These comprise `n_informative` + informative features, `n_redundant` redundant features, + `n_repeated` duplicated features and + :py:`n_features-n_informative-n_redundant-n_repeated` useless features drawn at random. n_informative : int, optional (default=2) The number of informative features. Each class is composed of a number of gaussian clusters each located around the vertices of a hypercube - in a subspace of dimension ``n_informative``. For each cluster, + in a subspace of dimension `n_informative`. For each cluster, informative features are drawn independently from N(0, 1) and then randomly linearly combined within each cluster in order to add covariance. The clusters are then placed on the vertices of the @@ -132,10 +132,10 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, weights : array-like of shape (n_classes,) or (n_classes - 1,),\ (default=None) The proportions of samples assigned to each class. If None, then - classes are balanced. Note that if ``len(weights) == n_classes - 1``, + classes are balanced. 
Note that if :py:`len(weights) == n_classes - 1`, then the last class weight is automatically inferred. - More than ``n_samples`` samples may be returned if the sum of - ``weights`` exceeds 1. + More than `n_samples` samples may be returned if the sum of + `weights` exceeds 1. flip_y : float, optional (default=0.01) The fraction of samples whose class is assigned randomly. Larger values introduce noise in the labels and make the classification @@ -188,7 +188,7 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, time for each feature class (informative, repeated, etc.) while also providing the added speedup of generating a big matrix on GPU - 2. We generate `order=F` construction. We exploit the + 2. We generate :py:`order=F` construction. We exploit the fact that X is a generated from a univariate normal, and covariance is introduced with matrix multiplications. Which means, we can generate X as a 1D array and just reshape it to the @@ -196,8 +196,8 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, copies 3. Lastly, we also shuffle by construction. Centroid indices are permuted for each sample, and then we construct the data for - each centroid. This shuffle works for both `order=C` and - `order=F` and eliminates any need for secondary copies + each centroid. This shuffle works for both :py:`order=C` and + :py:`order=F` and eliminates any need for secondary copies References ---------- diff --git a/python/cuml/decomposition/incremental_pca.py b/python/cuml/decomposition/incremental_pca.py index 4bad91e08c..3538c89ddc 100644 --- a/python/cuml/decomposition/incremental_pca.py +++ b/python/cuml/decomposition/incremental_pca.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -39,15 +39,15 @@ class IncrementalPCA(PCA): Depending on the size of the input data, this algorithm can be much more memory efficient than a PCA, and allows sparse input. This algorithm has constant memory complexity, on the order of - ``batch_size * n_features``, enabling use of np.memmap files without + :py:`batch_size * n_features`, enabling use of np.memmap files without loading the entire file into memory. For sparse matrices, the input is converted to dense in batches (in order to be able to subtract the mean) which avoids storing the entire dense matrix at any one time. The computational overhead of each SVD is - ``O(batch_size * n_features ** 2)``, but only 2 * batch_size samples - remain in memory at a time. There will be ``n_samples / batch_size`` + :py:`O(batch_size * n_features ** 2)`, but only 2 * batch_size samples + remain in memory at a time. There will be :py:`n_samples / batch_size` SVD computations to get the principal components, versus 1 large SVD - of complexity ``O(n_samples * n_features ** 2)`` for PCA. + of complexity :py:`O(n_samples * n_features ** 2)` for PCA. Parameters ---------- @@ -60,8 +60,8 @@ class IncrementalPCA(PCA): handles in several streams. If it is None, a new one is created. n_components : int or None, (default=None) - Number of components to keep. If ``n_components`` is ``None``, - then ``n_components`` is set to ``min(n_samples, n_features)``. + Number of components to keep. If `n_components` is ``None``, + then `n_components` is set to :py:`min(n_samples, n_features)`. whiten : bool, optional If True, de-correlates the components. 
This is done by dividing them by the corresponding singular values then multiplying by sqrt(n_samples). @@ -69,12 +69,12 @@ class IncrementalPCA(PCA): multi-collinearity. It might be beneficial for downstream tasks like LinearRegression where correlated features cause problems. copy : bool, (default=True) - If False, X will be overwritten. ``copy=False`` can be used to + If False, X will be overwritten. :py:`copy=False` can be used to save memory but is unsafe for general use. batch_size : int or None, (default=None) The number of samples to use for each batch. Only used when calling - ``fit``. If ``batch_size`` is ``None``, then ``batch_size`` - is inferred from the data and set to ``5 * n_features``, to provide a + `fit`. If `batch_size` is ``None``, then `batch_size` + is inferred from the data and set to :py:`5 * n_features`, to provide a balance between approximation accuracy and memory consumption. verbose : int or boolean, default=False Sets logging level. It must be one of `cuml.common.logger.level_*`. @@ -98,24 +98,24 @@ class IncrementalPCA(PCA): to 1.0. singular_values_ : array, shape (n_components,) The singular values corresponding to each of the selected components. - The singular values are equal to the 2-norms of the ``n_components`` + The singular values are equal to the 2-norms of the `n_components` variables in the lower-dimensional space. mean_ : array, shape (n_features,) - Per-feature empirical mean, aggregate over calls to ``partial_fit``. + Per-feature empirical mean, aggregate over calls to `partial_fit`. var_ : array, shape (n_features,) Per-feature empirical variance, aggregate over calls to - ``partial_fit``. + `partial_fit`. noise_variance_ : float The estimated noise covariance following the Probabilistic PCA model from [4]_. n_components_ : int The estimated number of components. Relevant when - ``n_components=None``. + `n_components=None`. n_samples_seen_ : int The number of samples processed by the estimator. Will be reset on - new calls to fit, but increments across ``partial_fit`` calls. + new calls to fit, but increments across `partial_fit` calls. batch_size_ : int - Inferred batch size from ``batch_size``. + Inferred batch size from `batch_size`. Notes ----- @@ -126,8 +126,8 @@ class IncrementalPCA(PCA): decomposition used in specific situations to reduce the algorithmic complexity of the SVD. The source for this technique is [3]_. This technique has been omitted because it is advantageous only when decomposing - a matrix with ``n_samples >= 5/3 * n_features`` where ``n_samples`` and - ``n_features`` are the matrix rows and columns, respectively. In addition, + a matrix with :py:`n_samples >= 5/3 * n_features` where `n_samples` and + `n_features` are the matrix rows and columns, respectively. In addition, it hurts the readability of the implemented algorithm. This would be a good opportunity for future optimization, if it is deemed necessary. diff --git a/python/cuml/svm/svc.pyx b/python/cuml/svm/svc.pyx index 596908d086..04d30aceb8 100644 --- a/python/cuml/svm/svc.pyx +++ b/python/cuml/svm/svc.pyx @@ -216,7 +216,7 @@ class SVC(SVMBase, ClassifierMixin): coef_ : float, shape (1, n_cols) Only available for linear kernels. It is the normal of the hyperplane. 
-        ``coef_ = sum_k=1..n_support dual_coef_[k] * support_vectors[k,:]``
+        coef_ = sum_k=1..n_support dual_coef_[k] * support_vectors[k,:]
     classes_: shape (n_classes_,)
         Array of class labels
     n_classes_ : int

From 8201a33657697a2ef3c8f8587322797c63e2c83b Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Tue, 2 Feb 2021 20:56:09 +0100
Subject: [PATCH 05/29] MNMG DBSCAN (#3382)

This Pull Request adds initial support for multi-node multi-GPU DBSCAN, and fixes the bugs identified in #3094.

It works by copying the dataset to all the workers and giving ownership of a subset of points to each one. The workers compute a partial clustering with the knowledge of the relationships between their points and the rest of the dataset, and the partial clusterings are merged to form the final labeling. This merging algorithm is also used to accumulate the results in case a batch-wise approach is used on a worker to limit the memory consumption.

The multi-GPU implementation gives great speedups for large datasets, while for small datasets the performance is dominated by the Dask launch overhead, as shown in the figure below:

![mnmg_dbscan_perf](https://user-images.githubusercontent.com/17441062/104958437-55a6da80-59d0-11eb-8a18-fcca0d69c41b.png)

Notes:
- I have renamed variables in the DBSCAN implementation to match our style conventions (snake case). Sorry for the noise that it adds to this PR.
- I refactored some CSR tests to accept multiple test cases instead of hardcoded ones, in order to add corner cases to weak CC. PR #3157 by @cjnolet changed the location of these tests, so I moved those that I had already refactored accordingly. At the moment only the tests that were in `cpp/test/prims/csr.cu` previously have been refactored. I was thinking that the others could be refactored later; I'd like @cjnolet's opinion on this refactoring.
- Regarding testing, the MNMG tests are mostly a copy of the single-GPU ones, though I removed a few tests with very small datasets to avoid problems with MNMG (it doesn't really support the edge case where a worker owns 0 samples, and I think it's a fair assumption that MNMG DBSCAN isn't used with such a tiny dataset).
- Also regarding tests, I changed the comparison function to account for the fact that border points are ambiguous. It assumes that the labeling of core points is minimal in both our implementation and the reference, so if this assumption changes we will need to update the tests accordingly.

If you want to access a pseudo-code description and proof of the new algorithm, feel free to contact me.

Tagging people to whom this PR is relevant: @teju85 @tfeher @MatthiasKohl @canonizer

Authors:
  - Louis Sugy (@Nyrio)

Approvers:
  - Tamas Bela Feher (@tfeher)
  - Corey J.
Nolet (@cjnolet) URL: https://github.com/rapidsai/cuml/pull/3382 --- cpp/CMakeLists.txt | 2 +- cpp/bench/sg/dbscan.cu | 10 +- cpp/examples/dbscan/dbscan_example.cpp | 6 +- cpp/include/cuml/cluster/dbscan.hpp | 39 +-- cpp/src/dbscan/adjgraph/algo.cuh | 25 +- cpp/src/dbscan/adjgraph/naive.cuh | 32 +- cpp/src/dbscan/adjgraph/pack.h | 10 +- cpp/src/dbscan/adjgraph/runner.cuh | 15 +- cpp/src/dbscan/corepoints/compute.cuh | 50 ++++ cpp/src/dbscan/corepoints/exchange.cuh | 59 ++++ cpp/src/dbscan/dbscan.cu | 115 ++++---- cpp/src/dbscan/dbscan.cuh | 140 +++++---- cpp/src/dbscan/dbscan_api.cpp | 10 +- cpp/src/dbscan/dbscan_api.h | 2 +- cpp/src/dbscan/mergelabels/runner.cuh | 47 +++ cpp/src/dbscan/mergelabels/tree_reduction.cuh | 82 ++++++ cpp/src/dbscan/runner.cuh | 274 +++++++++++------- cpp/src/dbscan/vertexdeg/algo.cuh | 15 +- cpp/src/dbscan/vertexdeg/naive.cuh | 34 ++- cpp/src/dbscan/vertexdeg/pack.h | 6 +- cpp/src/dbscan/vertexdeg/runner.cuh | 15 +- cpp/src_prims/label/merge_labels.cuh | 146 ++++++++++ cpp/src_prims/sparse/csr.cuh | 247 ++++++---------- cpp/test/CMakeLists.txt | 1 + cpp/test/prims/csr.cu | 243 +++++++++++----- cpp/test/prims/csr.h | 34 --- cpp/test/prims/merge_labels.cu | 147 ++++++++++ cpp/test/prims/sparse/add.cu | 219 ++++++++------ cpp/test/prims/sparse/convert_coo.cu | 93 +++--- cpp/test/prims/sparse/convert_csr.cu | 110 +++++-- cpp/test/prims/sparse/norm.cu | 171 +++++------ cpp/test/prims/sparse/row_op.cu | 116 +++++--- cpp/test/sg/dbscan_test.cu | 15 +- docs/source/api.rst | 6 + python/cuml/cluster/__init__.py | 2 +- python/cuml/cluster/dbscan.pyx | 216 +++++++------- python/cuml/cluster/dbscan_mg.pyx | 45 +++ python/cuml/dask/cluster/__init__.py | 3 +- python/cuml/dask/cluster/dbscan.py | 158 ++++++++++ python/cuml/test/dask/test_dbscan.py | 215 ++++++++++++++ python/cuml/test/test_dbscan.py | 149 ++++++---- python/cuml/test/utils.py | 39 +++ 42 files changed, 2303 insertions(+), 1060 deletions(-) create mode 100644 cpp/src/dbscan/corepoints/compute.cuh create mode 100644 cpp/src/dbscan/corepoints/exchange.cuh create mode 100644 cpp/src/dbscan/mergelabels/runner.cuh create mode 100644 cpp/src/dbscan/mergelabels/tree_reduction.cuh create mode 100644 cpp/src_prims/label/merge_labels.cuh delete mode 100644 cpp/test/prims/csr.h create mode 100644 cpp/test/prims/merge_labels.cu create mode 100644 python/cuml/cluster/dbscan_mg.pyx create mode 100644 python/cuml/dask/cluster/dbscan.py create mode 100644 python/cuml/test/dask/test_dbscan.py diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d6cf851aef..2eacef46eb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/cpp/bench/sg/dbscan.cu b/cpp/bench/sg/dbscan.cu index 948dca6a74..44818eb959 100644 --- a/cpp/bench/sg/dbscan.cu +++ b/cpp/bench/sg/dbscan.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -51,10 +51,10 @@ class Dbscan : public BlobsFixture { state.SkipWithError("Dbscan only supports row-major inputs"); } this->loopOnState(state, [this, &state]() { - dbscanFit(*this->handle, this->data.X, this->params.nrows, - this->params.ncols, D(dParams.eps), dParams.min_pts, - this->data.y, this->core_sample_indices, - dParams.max_bytes_per_batch); + ML::Dbscan::fit(*this->handle, this->data.X, this->params.nrows, + this->params.ncols, D(dParams.eps), dParams.min_pts, + this->data.y, this->core_sample_indices, + dParams.max_bytes_per_batch); state.SetItemsProcessed(this->params.nrows * this->params.ncols); }); } diff --git a/cpp/examples/dbscan/dbscan_example.cpp b/cpp/examples/dbscan/dbscan_example.cpp index 273d1fa71e..37722b65d6 100644 --- a/cpp/examples/dbscan/dbscan_example.cpp +++ b/cpp/examples/dbscan/dbscan_example.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -200,8 +200,8 @@ int main(int argc, char* argv[]) { << "eps - " << eps << std::endl << "max_bytes_per_batch - " << max_bytes_per_batch << std::endl; - ML::dbscanFit(handle, d_inputData, nRows, nCols, eps, minPts, d_labels, - nullptr, max_bytes_per_batch, false); + ML::Dbscan::fit(handle, d_inputData, nRows, nCols, eps, minPts, d_labels, + nullptr, max_bytes_per_batch, false); CUDA_RT_CALL(cudaMemcpyAsync(h_labels.data(), d_labels, nRows * sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_RT_CALL(cudaStreamSynchronize(stream)); diff --git a/cpp/include/cuml/cluster/dbscan.hpp b/cpp/include/cuml/cluster/dbscan.hpp index e1a1dbe350..710a7aa423 100644 --- a/cpp/include/cuml/cluster/dbscan.hpp +++ b/cpp/include/cuml/cluster/dbscan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include namespace ML { +namespace Dbscan { /** * @defgroup DbscanCpp C++ implementation of Dbscan algo @@ -39,28 +40,30 @@ namespace ML { * @param[in] max_bytes_per_batch the maximum number of megabytes to be used for * each batch of the pairwise distance calculation. This enables the * trade off between memory usage and algorithm execution time. 
- * @param[in] verbosity: verbosity level for logging messages during execution + * @param[in] verbosity verbosity level for logging messages during execution + * @param[in] opg whether we are running in a multi-node multi-GPU context * @{ */ -void dbscanFit(const raft::handle_t &handle, float *input, int n_rows, - int n_cols, float eps, int min_pts, int *labels, - int *core_sample_indices = nullptr, - size_t max_bytes_per_batch = 0, int verbosity = CUML_LEVEL_INFO); -void dbscanFit(const raft::handle_t &handle, double *input, int n_rows, - int n_cols, double eps, int min_pts, int *labels, - int *core_sample_indices = nullptr, - size_t max_bytes_per_batch = 0, int verbosity = CUML_LEVEL_INFO); +void fit(const raft::handle_t &handle, float *input, int n_rows, int n_cols, + float eps, int min_pts, int *labels, + int *core_sample_indices = nullptr, size_t max_bytes_per_batch = 0, + int verbosity = CUML_LEVEL_INFO, bool opg = false); +void fit(const raft::handle_t &handle, double *input, int n_rows, int n_cols, + double eps, int min_pts, int *labels, + int *core_sample_indices = nullptr, size_t max_bytes_per_batch = 0, + int verbosity = CUML_LEVEL_INFO, bool opg = false); -void dbscanFit(const raft::handle_t &handle, float *input, int64_t n_rows, - int64_t n_cols, float eps, int min_pts, int64_t *labels, - int64_t *core_sample_indices = nullptr, - size_t max_bytes_per_batch = 0, int verbosity = CUML_LEVEL_INFO); -void dbscanFit(const raft::handle_t &handle, double *input, int64_t n_rows, - int64_t n_cols, double eps, int min_pts, int64_t *labels, - int64_t *core_sample_indices = nullptr, - size_t max_bytes_per_batch = 0, int verbosity = CUML_LEVEL_INFO); +void fit(const raft::handle_t &handle, float *input, int64_t n_rows, + int64_t n_cols, float eps, int min_pts, int64_t *labels, + int64_t *core_sample_indices = nullptr, size_t max_bytes_per_batch = 0, + int verbosity = CUML_LEVEL_INFO, bool opg = false); +void fit(const raft::handle_t &handle, double *input, int64_t n_rows, + int64_t n_cols, double eps, int min_pts, int64_t *labels, + int64_t *core_sample_indices = nullptr, size_t max_bytes_per_batch = 0, + int verbosity = CUML_LEVEL_INFO, bool opg = false); /** @} */ +} // namespace Dbscan } // namespace ML diff --git a/cpp/src/dbscan/adjgraph/algo.cuh b/cpp/src/dbscan/adjgraph/algo.cuh index 24a9f3f720..163d943377 100644 --- a/cpp/src/dbscan/adjgraph/algo.cuh +++ b/cpp/src/dbscan/adjgraph/algo.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ using namespace thrust; +namespace ML { namespace Dbscan { namespace AdjGraph { namespace Algo { @@ -38,30 +39,21 @@ static const int TPB_X = 256; /** * Takes vertex degree array (vd) and CSR row_ind array (ex_scan) to produce the - * CSR row_ind_ptr array (adj_graph) and filters into a core_pts array based on min_pts. 
+ * CSR row_ind_ptr array (adj_graph) */ template -void launcher(const raft::handle_t &handle, Pack data, Index_ batchSize, - cudaStream_t stream) { +void launcher(const raft::handle_t &handle, Pack data, + Index_ batch_size, cudaStream_t stream) { device_ptr dev_vd = device_pointer_cast(data.vd); device_ptr dev_ex_scan = device_pointer_cast(data.ex_scan); ML::thrustAllocatorAdapter alloc(handle.get_device_allocator(), stream); exclusive_scan(thrust::cuda::par(alloc).on(stream), dev_vd, - dev_vd + batchSize, dev_ex_scan); - - bool *core_pts = data.core_pts; - int minPts = data.minPts; - Index_ *vd = data.vd; + dev_vd + batch_size, dev_ex_scan); raft::sparse::convert::csr_adj_graph_batched( - data.ex_scan, data.N, data.adjnnz, batchSize, data.adj, data.adj_graph, - stream, - [core_pts, minPts, vd] __device__(Index_ row, Index_ start_idx, - Index_ stop_idx) { - // fuse the operation of core points construction - core_pts[row] = (vd[row] >= minPts); - }); + data.ex_scan, data.N, data.adjnnz, batch_size, data.adj, data.adj_graph, + stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -69,3 +61,4 @@ void launcher(const raft::handle_t &handle, Pack data, Index_ batchSize, } // namespace Algo } // namespace AdjGraph } // namespace Dbscan +} // namespace ML \ No newline at end of file diff --git a/cpp/src/dbscan/adjgraph/naive.cuh b/cpp/src/dbscan/adjgraph/naive.cuh index ae44f5a59e..20d0175863 100644 --- a/cpp/src/dbscan/adjgraph/naive.cuh +++ b/cpp/src/dbscan/adjgraph/naive.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,47 +23,45 @@ #include "../common.cuh" #include "pack.h" +namespace ML { namespace Dbscan { namespace AdjGraph { namespace Naive { template -void launcher(const raft::handle_t& handle, Pack data, Index_ batchSize, - cudaStream_t stream) { +void launcher(const raft::handle_t& handle, Pack data, + Index_ batch_size, cudaStream_t stream) { Index_ k = 0; Index_ N = data.N; MLCommon::host_buffer host_vd(handle.get_host_allocator(), stream, - batchSize + 1); - MLCommon::host_buffer host_core_pts(handle.get_host_allocator(), stream, - batchSize); + batch_size + 1); MLCommon::host_buffer host_adj(handle.get_host_allocator(), stream, - batchSize * N); + batch_size * N); MLCommon::host_buffer host_ex_scan(handle.get_host_allocator(), - stream, batchSize); - raft::update_host(host_adj.data(), data.adj, batchSize * N, stream); - raft::update_host(host_vd.data(), data.vd, batchSize + 1, stream); + stream, batch_size); + raft::update_host(host_adj.data(), data.adj, batch_size * N, stream); + raft::update_host(host_vd.data(), data.vd, batch_size + 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - size_t adjgraph_size = size_t(host_vd[batchSize]); + size_t adjgraph_size = size_t(host_vd[batch_size]); MLCommon::host_buffer host_adj_graph(handle.get_host_allocator(), stream, adjgraph_size); - for (Index_ i = 0; i < batchSize; i++) { + for (Index_ i = 0; i < batch_size; i++) { for (Index_ j = 0; j < N; j++) { + /// TODO: change layout or remove; cf #3414 if (host_adj[i * N + j]) { host_adj_graph[k] = j; k = k + 1; } } } - for (Index_ i = 0; i < batchSize; i++) - host_core_pts[i] = (host_vd[i] >= data.minPts); host_ex_scan[0] = Index_(0); - for (Index_ i = 1; i < batchSize; i++) + for (Index_ i = 1; i < batch_size; i++) host_ex_scan[i] = host_ex_scan[i - 1] + host_vd[i - 1]; 
raft::update_device(data.adj_graph, host_adj_graph.data(), adjgraph_size, stream); - raft::update_device(data.core_pts, host_core_pts.data(), batchSize, stream); - raft::update_device(data.ex_scan, host_ex_scan.data(), batchSize, stream); + raft::update_device(data.ex_scan, host_ex_scan.data(), batch_size, stream); } } // namespace Naive } // namespace AdjGraph } // namespace Dbscan +} // namespace ML \ No newline at end of file diff --git a/cpp/src/dbscan/adjgraph/pack.h b/cpp/src/dbscan/adjgraph/pack.h index 83bf85c943..4e6eafe101 100644 --- a/cpp/src/dbscan/adjgraph/pack.h +++ b/cpp/src/dbscan/adjgraph/pack.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once +namespace ML { namespace Dbscan { namespace AdjGraph { @@ -36,13 +37,10 @@ struct Pack { /** exculusive scan generated from vd */ Index_ *ex_scan; - /** array to store whether a vertex is core poType or not */ - bool *core_pts; - /** number of poTypes in the dataset */ + /** number of points in the dataset */ Index_ N; - /** Minpts for classifying core pts */ - Index_ minPts; }; } // namespace AdjGraph } // namespace Dbscan +} // namespace ML diff --git a/cpp/src/dbscan/adjgraph/runner.cuh b/cpp/src/dbscan/adjgraph/runner.cuh index 90122a3bd5..c9298c7cea 100644 --- a/cpp/src/dbscan/adjgraph/runner.cuh +++ b/cpp/src/dbscan/adjgraph/runner.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,21 +21,21 @@ #include "naive.cuh" #include "pack.h" +namespace ML { namespace Dbscan { namespace AdjGraph { template void run(const raft::handle_t& handle, bool* adj, Index_* vd, Index_* adj_graph, - Index_ adjnnz, Index_* ex_scan, Index_ N, Index_ minpts, - bool* core_pts, int algo, Index_ batchSize, cudaStream_t stream) { - Pack data = {vd, adj, adj_graph, adjnnz, - ex_scan, core_pts, N, minpts}; + Index_ adjnnz, Index_* ex_scan, Index_ N, int algo, Index_ batch_size, + cudaStream_t stream) { + Pack data = {vd, adj, adj_graph, adjnnz, ex_scan, N}; switch (algo) { case 0: - Naive::launcher(handle, data, batchSize, stream); + Naive::launcher(handle, data, batch_size, stream); break; case 1: - Algo::launcher(handle, data, batchSize, stream); + Algo::launcher(handle, data, batch_size, stream); break; default: ASSERT(false, "Incorrect algo passed! '%d'", algo); @@ -44,3 +44,4 @@ void run(const raft::handle_t& handle, bool* adj, Index_* vd, Index_* adj_graph, } // namespace AdjGraph } // namespace Dbscan +} // namespace ML \ No newline at end of file diff --git a/cpp/src/dbscan/corepoints/compute.cuh b/cpp/src/dbscan/corepoints/compute.cuh new file mode 100644 index 0000000000..6de9e3c957 --- /dev/null +++ b/cpp/src/dbscan/corepoints/compute.cuh @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace ML { +namespace Dbscan { +namespace CorePoints { + +/** + * Compute the core points from the vertex degrees and min_pts criterion + * @param[in] handle cuML handle + * @param[in] vd Vertex degrees + * @param[out] mask Boolean core point mask + * @param[in] min_pts Core point criterion + * @param[in] start_vertex_id First point of the batch + * @param[in] batch_size Batch size + * @param[in] stream CUDA stream + */ +template +void compute(const raft::handle_t& handle, const Index_* vd, bool* mask, + Index_ min_pts, Index_ start_vertex_id, Index_ batch_size, + cudaStream_t stream) { + auto execution_policy = + ML::thrust_exec_policy(handle.get_device_allocator(), stream); + auto counting = thrust::make_counting_iterator(0); + thrust::for_each(execution_policy->on(stream), counting, + counting + batch_size, [=] __device__(Index_ idx) { + mask[idx + start_vertex_id] = vd[idx] >= min_pts; + }); +} + +} // namespace CorePoints +} // namespace Dbscan +} // namespace ML diff --git a/cpp/src/dbscan/corepoints/exchange.cuh b/cpp/src/dbscan/corepoints/exchange.cuh new file mode 100644 index 0000000000..6a4883b1a1 --- /dev/null +++ b/cpp/src/dbscan/corepoints/exchange.cuh @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include
+
+namespace ML {
+namespace Dbscan {
+namespace CorePoints {
+
+/**
+ * Exchange the core point mask between workers so that every worker holds
+ * the complete mask for all N points
+ * @param[inout] mask Boolean core point mask
+ * @param[in] handle cuML handle
+ * @param[in] N Number of points
+ * @param[in] start_row Offset for this node
+ * @param[in] stream CUDA stream
+ */
+template
+void exchange(const raft::handle_t& handle, bool* mask, Index_ N,
+              Index_ start_row, cudaStream_t stream) {
+  const auto& comm = handle.get_comms();
+  int my_rank = comm.get_rank();
+  int n_rank = comm.get_size();
+
+  // Array with the size of the contribution of each worker
+  Index_ rows_per_rank = raft::ceildiv(N, n_rank);
+  std::vector recvcounts = std::vector(n_rank, rows_per_rank);
+  recvcounts[n_rank - 1] = N - (n_rank - 1) * rows_per_rank;
+
+  // Array with the displacement of each part
+  std::vector displs = std::vector(n_rank);
+  for (int i = 0; i < n_rank; i++) displs[i] = i * rows_per_rank;
+
+  // All-gather operation with variable contribution length
+  comm.allgatherv((char*)mask + start_row, (char*)mask, recvcounts.data(),
+                  displs.data(), stream);
+  ASSERT(comm.sync_stream(stream) == raft::comms::status_t::SUCCESS,
+         "An error occurred in the distributed operation. This can result from "
+         "a failed rank");
+}
+
+}  // namespace CorePoints
+}  // namespace Dbscan
+}  // namespace ML
diff --git a/cpp/src/dbscan/dbscan.cu b/cpp/src/dbscan/dbscan.cu
index 7f8ae9e286..a70b31ae83 100644
--- a/cpp/src/dbscan/dbscan.cu
+++ b/cpp/src/dbscan/dbscan.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,78 +18,63 @@ #include #include #include "dbscan.cuh" -#include "runner.cuh" namespace ML { +namespace Dbscan { -using namespace Dbscan; - -void dbscanFit(const raft::handle_t &handle, float *input, int n_rows, - int n_cols, float eps, int min_pts, int *labels, - size_t max_bytes_per_batch, int verbosity) { - dbscanFitImpl(handle, input, n_rows, n_cols, eps, min_pts, labels, - nullptr, max_bytes_per_batch, handle.get_stream(), - verbosity); -} - -void dbscanFit(const raft::handle_t &handle, double *input, int n_rows, - int n_cols, double eps, int min_pts, int *labels, - size_t max_bytes_per_batch, int verbosity) { - dbscanFitImpl(handle, input, n_rows, n_cols, eps, min_pts, - labels, nullptr, max_bytes_per_batch, - handle.get_stream(), verbosity); -} - -void dbscanFit(const raft::handle_t &handle, float *input, int n_rows, - int n_cols, float eps, int min_pts, int *labels, - int *core_sample_indices, size_t max_bytes_per_batch, - int verbosity) { - dbscanFitImpl(handle, input, n_rows, n_cols, eps, min_pts, labels, - core_sample_indices, max_bytes_per_batch, - handle.get_stream(), verbosity); -} - -void dbscanFit(const raft::handle_t &handle, double *input, int n_rows, - int n_cols, double eps, int min_pts, int *labels, - int *core_sample_indices, size_t max_bytes_per_batch, - int verbosity) { - dbscanFitImpl(handle, input, n_rows, n_cols, eps, min_pts, - labels, core_sample_indices, max_bytes_per_batch, - handle.get_stream(), verbosity); -} - -void dbscanFit(const raft::handle_t &handle, float *input, int64_t n_rows, - int64_t n_cols, float eps, int min_pts, int64_t *labels, - size_t max_bytes_per_batch, int verbosity) { - dbscanFitImpl(handle, input, n_rows, n_cols, eps, min_pts, - labels, nullptr, max_bytes_per_batch, - handle.get_stream(), verbosity); +void fit(const raft::handle_t &handle, float *input, int n_rows, int n_cols, + float eps, int min_pts, int *labels, int *core_sample_indices, + size_t max_bytes_per_batch, int verbosity, bool opg) { + if (opg) + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); + else + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); } -void dbscanFit(const raft::handle_t &handle, double *input, int64_t n_rows, - int64_t n_cols, double eps, int min_pts, int64_t *labels, - size_t max_bytes_per_batch, int verbosity) { - dbscanFitImpl(handle, input, n_rows, n_cols, eps, min_pts, - labels, nullptr, max_bytes_per_batch, - handle.get_stream(), verbosity); +void fit(const raft::handle_t &handle, double *input, int n_rows, int n_cols, + double eps, int min_pts, int *labels, int *core_sample_indices, + size_t max_bytes_per_batch, int verbosity, bool opg) { + if (opg) + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); + else + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); } -void dbscanFit(const raft::handle_t &handle, float *input, int64_t n_rows, - int64_t n_cols, float eps, int min_pts, int64_t *labels, - int64_t *core_sample_indices, size_t max_bytes_per_batch, - int verbosity) { - dbscanFitImpl( - handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, - max_bytes_per_batch, handle.get_stream(), verbosity); +void fit(const raft::handle_t &handle, 
float *input, int64_t n_rows, + int64_t n_cols, float eps, int min_pts, int64_t *labels, + int64_t *core_sample_indices, size_t max_bytes_per_batch, + int verbosity, bool opg) { + if (opg) + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); + else + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); } -void dbscanFit(const raft::handle_t &handle, double *input, int64_t n_rows, - int64_t n_cols, double eps, int min_pts, int64_t *labels, - int64_t *core_sample_indices, size_t max_bytes_per_batch, - int verbosity) { - dbscanFitImpl( - handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, - max_bytes_per_batch, handle.get_stream(), verbosity); +void fit(const raft::handle_t &handle, double *input, int64_t n_rows, + int64_t n_cols, double eps, int min_pts, int64_t *labels, + int64_t *core_sample_indices, size_t max_bytes_per_batch, + int verbosity, bool opg) { + if (opg) + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); + else + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); } -}; // end namespace ML +} // namespace Dbscan +} // namespace ML diff --git a/cpp/src/dbscan/dbscan.cuh b/cpp/src/dbscan/dbscan.cuh index af6420f1d1..6c8b600e0f 100644 --- a/cpp/src/dbscan/dbscan.cuh +++ b/cpp/src/dbscan/dbscan.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,14 +25,13 @@ namespace ML { -using namespace Dbscan; // Default max mem set to a reasonable value for a 16gb card. static const size_t DEFAULT_MAX_MEM_MBYTES = 13e3; template -Index_ computeBatchCount(size_t &estimated_memory, Index_ n_rows, - size_t max_mbytes_per_batch = 0, - Index_ neigh_per_row = 0) { +Index_ compute_batch_size(size_t &estimated_memory, Index_ n_rows, + Index_ n_owned_rows, size_t max_mbytes_per_batch = 0, + Index_ neigh_per_row = 0) { // In real applications, it's unlikely that the sparse adjacency matrix // comes even close to the worst-case memory usage, because if epsilon // is so large that all points are connected to 10% or even more of other @@ -41,16 +40,21 @@ Index_ computeBatchCount(size_t &estimated_memory, Index_ n_rows, if (neigh_per_row <= 0) neigh_per_row = n_rows; - // we'll estimate the memory consumption per row. 
- // First the dense adjacency matrix - estimated_memory = n_rows * sizeof(bool); - // sparse adjacency matrix - estimated_memory += neigh_per_row * sizeof(Index_); - // core points and two indicator variables - estimated_memory += 3 * sizeof(bool); - // the rest will be so small that it should fit into what we have left over + /* Memory needed per batch row: + * - Dense adj matrix: n_rows (bool) + * - Sparse adj matrix: neigh_per_row (Index_) + * - Vertex degrees: 1 (Index_) + * - Ex scan: 1 (Index_) + */ + size_t est_mem_per_row = + n_rows * sizeof(bool) + (neigh_per_row + 2) * sizeof(Index_); + /* Memory needed regardless of the batch size: + * - Temporary labels: n_rows (Index_) + * - Core point mask: n_rows (bool) + */ + size_t est_mem_fixed = n_rows * (sizeof(Index_) + sizeof(bool)); + // The rest will be so small that it should fit into what we have left over // from the over-estimation of the sparse adjacency matrix - estimated_memory *= n_rows; if (max_mbytes_per_batch <= 0) { /* using default here as in decision tree, waiting for mem info from device allocator @@ -60,69 +64,93 @@ Index_ computeBatchCount(size_t &estimated_memory, Index_ n_rows, max_mbytes_per_batch = DEFAULT_MAX_MEM_MBYTES; } - Index_ nBatches = (Index_)raft::ceildiv( - estimated_memory, max_mbytes_per_batch * 1000000); + // Batch size determined based on available memory + Index_ batch_size = + (max_mbytes_per_batch * 1000000 - est_mem_fixed) / est_mem_per_row; + + // Limit batch size to number of owned rows + batch_size = std::min(n_owned_rows, batch_size); + + // To avoid overflow, we need: batch_size <= MAX_LABEL / n_rows (floor div) Index_ MAX_LABEL = std::numeric_limits::max(); - // to avoid overflow, we need: batch_size <= MAX_LABEL / n_rows (floor div) - // -> num_batches >= raft::ceildiv(n_rows / (MAX_LABEL / n_rows)) - Index_ nBatchesPrec = raft::ceildiv(n_rows, MAX_LABEL / n_rows); - // at some point, if nBatchesPrec is larger than nBatches - // (or larger by a given factor) and we know that there are clear - // performance benefits of using a smaller number of batches, - // we should probably warn the user. - // In the latest benchmarks, it seems like using int64 indexing and batches - // that are much larger than 2.10^9 points (the limit for int32), doesn't - // actually improve performance, even when using >16.10^9 points per batch. - // Much larger batches than 16.10^9 do not currently fit on GPU architectures + if (batch_size > MAX_LABEL / n_rows) { + Index_ new_batch_size = MAX_LABEL / n_rows; + CUML_LOG_WARN( + "Batch size limited by the chosen integer type (%d bytes). %d -> %d. " + "Using the larger integer type might result in better performance", + (int)sizeof(Index_), (int)batch_size, (int)new_batch_size); + batch_size = new_batch_size; + } + + // Warn when a smaller index type could be used if (sizeof(Index_) > sizeof(int) && - (size_t)n_rows * raft::ceildiv(n_rows, nBatches) < - std::numeric_limits::max()) { + batch_size < std::numeric_limits::max() / n_rows) { CUML_LOG_WARN( "You are using an index type of size (%d bytes) but a smaller index " - "type (%d bytes) would be sufficient. Consider using the smaller " - "index type for better performance.", + "type (%d bytes) would be sufficient. 
Using the smaller integer type " + "might result in better performance.", (int)sizeof(Index_), (int)sizeof(int)); } - if (nBatchesPrec > nBatches) { - nBatches = nBatchesPrec; - // we have to re-adjust memory estimation here - estimated_memory = nBatches * (estimated_memory / n_rows); - } - return max((Index_)1, nBatches); + + estimated_memory = batch_size * est_mem_per_row + est_mem_fixed; + return batch_size; } -template +template void dbscanFitImpl(const raft::handle_t &handle, T *input, Index_ n_rows, Index_ n_cols, T eps, Index_ min_pts, Index_ *labels, Index_ *core_sample_indices, size_t max_mbytes_per_batch, cudaStream_t stream, int verbosity) { ML::PUSH_RANGE("ML::Dbscan::Fit"); ML::Logger::get().setLevel(verbosity); - int algoVd = 1; - int algoAdj = 1; - int algoCcl = 2; + int algo_vd = 1; + int algo_adj = 1; + int algo_ccl = 2; + + int my_rank, n_rank; + Index_ start_row, n_owned_rows; + if (opg) { + const auto &comm = handle.get_comms(); + my_rank = comm.get_rank(); + n_rank = comm.get_size(); + Index_ rows_per_rank = raft::ceildiv(n_rows, n_rank); + start_row = my_rank * rows_per_rank; + Index_ end_row = min((my_rank + 1) * rows_per_rank, n_rows); + n_owned_rows = max(Index_(0), end_row - start_row); + // Note: it is possible for a node to have no work in theory. It won't + // happen in practice (because n_rows is much greater than n_rank) + } else { + my_rank = 0; + n_rank = 1; + n_owned_rows = n_rows; + } + + CUML_LOG_DEBUG("#%d owns %ld rows", (int)my_rank, + (unsigned long)n_owned_rows); - ///@todo: Query device for remaining memory + /// TODO: Query device for remaining memory size_t estimated_memory; - Index_ n_batches = - computeBatchCount(estimated_memory, n_rows, max_mbytes_per_batch); + Index_ batch_size = compute_batch_size( + estimated_memory, n_rows, n_owned_rows, max_mbytes_per_batch); - if (n_batches > 1) { - CUML_LOG_DEBUG("Running batched training on %ld batches w/ %lf MB", - (unsigned long)n_batches, - (double)estimated_memory * 1e-6 / n_batches); - } + CUML_LOG_DEBUG( + "Running batched training (batch size: %ld, estimated: %lf MB)", + (unsigned long)batch_size, (double)estimated_memory * 1e-6); + + size_t workspaceSize = Dbscan::run( + handle, input, n_rows, n_cols, start_row, n_owned_rows, eps, min_pts, + labels, core_sample_indices, algo_vd, algo_adj, algo_ccl, NULL, batch_size, + stream); - size_t workspaceSize = Dbscan::run( - handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, - algoVd, algoAdj, algoCcl, NULL, n_batches, stream); + CUML_LOG_DEBUG("Workspace size: %lf MB", (double)workspaceSize * 1e-6); MLCommon::device_buffer workspace(handle.get_device_allocator(), stream, workspaceSize); - Dbscan::run(handle, input, n_rows, n_cols, eps, min_pts, labels, - core_sample_indices, algoVd, algoAdj, algoCcl, workspace.data(), - n_batches, stream); + Dbscan::run(handle, input, n_rows, n_cols, start_row, + n_owned_rows, eps, min_pts, labels, + core_sample_indices, algo_vd, algo_adj, algo_ccl, + workspace.data(), batch_size, stream); ML::POP_RANGE(); } -}; // namespace ML +} // namespace ML diff --git a/cpp/src/dbscan/dbscan_api.cpp b/cpp/src/dbscan/dbscan_api.cpp index 15cb1bf684..a7570d3da5 100644 --- a/cpp/src/dbscan/dbscan_api.cpp +++ b/cpp/src/dbscan/dbscan_api.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,8 +28,8 @@ cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, int n_rows, std::tie(handle_ptr, status) = ML::handleMap.lookupHandlePointer(handle); if (status == CUML_SUCCESS) { try { - ML::dbscanFit(*handle_ptr, input, n_rows, n_cols, eps, min_pts, labels, - core_sample_indices, max_bytes_per_batch, verbosity); + ML::Dbscan::fit(*handle_ptr, input, n_rows, n_cols, eps, min_pts, labels, + core_sample_indices, max_bytes_per_batch, verbosity); } //TODO: Implement this //catch (const MLCommon::Exception& e) @@ -53,8 +53,8 @@ cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, int n_rows, std::tie(handle_ptr, status) = ML::handleMap.lookupHandlePointer(handle); if (status == CUML_SUCCESS) { try { - ML::dbscanFit(*handle_ptr, input, n_rows, n_cols, eps, min_pts, labels, - core_sample_indices, max_bytes_per_batch, verbosity); + ML::Dbscan::fit(*handle_ptr, input, n_rows, n_cols, eps, min_pts, labels, + core_sample_indices, max_bytes_per_batch, verbosity); } //TODO: Implement this //catch (const MLCommon::Exception& e) diff --git a/cpp/src/dbscan/dbscan_api.h b/cpp/src/dbscan/dbscan_api.h index e27f41ba7d..e7877200a2 100644 --- a/cpp/src/dbscan/dbscan_api.h +++ b/cpp/src/dbscan/dbscan_api.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/dbscan/mergelabels/runner.cuh b/cpp/src/dbscan/mergelabels/runner.cuh new file mode 100644 index 0000000000..1fc60144c0 --- /dev/null +++ b/cpp/src/dbscan/mergelabels/runner.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include