From da872572d5852cdd7d024f16f625abecf992e451 Mon Sep 17 00:00:00 2001
From: AJ Schmidt
Date: Mon, 1 Feb 2021 13:24:08 -0500
Subject: [PATCH 01/29] Prepare Changelog for Automation (#3442)

This PR prepares the changelog to be automatically updated during releases.

Authors:
  - AJ Schmidt (@ajschmidt8)

Approvers:
  - Dante Gama Dessavre (@dantegd)

URL: https://github.com/rapidsai/cuml/pull/3442
---
 CHANGELOG.md | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 36cbbf0cae..30ee4f829a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,13 +1,8 @@
-# cuML 0.18.0 (Date TBD)
+# 0.18.0
 
-## New Features
-
-## Improvements
-
-## Bug Fixes
-- PR #3279: Correct pure virtual declaration in manifold_inputs_t
+Please see https://github.com/rapidsai/cuml/releases/tag/branch-0.18-latest for the latest changes to this development branch.
 
-# cuML 0.17.0 (Date TBD)
+# cuML 0.17.0 (10 Dec 2020)
 
 ## New Features
 - PR #3164: Expose silhouette score in Python

From df67553234d3faead723b345b323155b1b8ae0ca Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Mon, 1 Feb 2021 13:30:05 -0600
Subject: [PATCH 02/29] Allow saving Dask RandomForest models immediately
 after training (fixes #3331) (#3388)

This attempts to fix #3331; see that issue for many more details.

Today, `.get_combined_model()` on the Dask RandomForest model objects returns `None` if it is called immediately after training. That pattern is recommended in ["Distributed Model Pickling"](https://docs.rapids.ai/api/cuml/stable/pickling_cuml_models.html#Distributed-Model-Pickling), so without this support there is no way to save a Dask RandomForest model using only public methods / attributes on those classes.

Per https://github.com/rapidsai/cuml/issues/3331#issuecomment-754125703, this PR proposes populating the internal model object whenever `get_combined_model()` is called.

## Notes for Reviewers

* I have not tested this locally. I spent about 3 hours trying to build `cuml` from source following https://github.com/rapidsai/cuml/blob/main/BUILD.md and was not successful. If there is a containerized setup for developing `cuml`, I'd greatly appreciate a pointer to it and would be happy to try it out. I've added a unit test for this change, so I hope that will be enough to confirm that this works and that CI will catch any mistakes I've made.

Thanks for your time and consideration.
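As a minimal sketch of the workflow this change enables (illustrative variable names only; it assumes an already-running dask-cuda cluster and follows the pattern from the "Distributed Model Pickling" docs linked above):

```python
import pickle

from dask.distributed import Client
from cuml.dask.ensemble import RandomForestClassifier

client = Client(cluster)           # `cluster` is an existing dask-cuda cluster
model = RandomForestClassifier(n_estimators=10)
model.fit(X_dask, y_dask)          # X_dask / y_dask: distributed training data

# Before this fix, calling this immediately after fit() returned None.
# Now it concatenates the per-worker forests into one single-GPU model.
combined = model.get_combined_model()

with open("rf_model.pkl", "wb") as f:
    pickle.dump(combined, f)
```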
+ """ + + # set internal model if it hasn't been accessed before + if self._get_internal_model() is None: + self._set_internal_model(self._concat_treelite_models()) + + internal_model = self._check_internal_model(self._get_internal_model()) + + if isinstance(self.internal_model, Iterable): + # This function needs to return a single instance of cuml.Base, + # even if the class is just a composite. + raise ValueError("Expected a single instance of cuml.Base " + "but got %s instead." % type(self.internal_model)) + + elif isinstance(self.internal_model, Future): + internal_model = self.internal_model.result() + + return internal_model + def _func_fit(model, input_data, convert_dtype): X = concatenate([item[0] for item in input_data]) diff --git a/python/cuml/test/dask/test_random_forest.py b/python/cuml/test/dask/test_random_forest.py index 305427383e..3d97f1f381 100644 --- a/python/cuml/test/dask/test_random_forest.py +++ b/python/cuml/test/dask/test_random_forest.py @@ -43,6 +43,9 @@ from cuml.dask.ensemble import RandomForestRegressor as cuRFR_mg from cuml.dask.common import utils as dask_utils +from cuml.ensemble import RandomForestClassifier as cuRFC_sg +from cuml.ensemble import RandomForestRegressor as cuRFR_sg + from dask.array import from_array from sklearn.datasets import make_regression, make_classification from sklearn.model_selection import train_test_split @@ -436,6 +439,47 @@ def predict_with_json_rf_regressor(rf, x): np.testing.assert_almost_equal(pred, expected_pred, decimal=6) +@pytest.mark.parametrize('estimator_type', ['regression', 'classification']) +def test_rf_get_combined_model_right_aftter_fit(client, estimator_type): + max_depth = 3 + n_estimators = 5 + X, y = make_classification() + X = X.astype(np.float32) + if estimator_type == 'classification': + cu_rf_mg = cuRFC_mg( + max_features=1.0, + max_samples=1.0, + n_bins=16, + n_streams=1, + n_estimators=n_estimators, + max_leaves=-1, + max_depth=max_depth + ) + y = y.astype(np.int32) + elif estimator_type == 'regression': + cu_rf_mg = cuRFR_mg( + max_features=1.0, + max_samples=1.0, + n_bins=16, + n_streams=1, + n_estimators=n_estimators, + max_leaves=-1, + max_depth=max_depth + ) + y = y.astype(np.float32) + else: + assert False + X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2) + cu_rf_mg.fit(X_dask, y_dask) + single_gpu_model = cu_rf_mg.get_combined_model() + if estimator_type == 'classification': + assert isinstance(single_gpu_model, cuRFC_sg) + elif estimator_type == 'regression': + assert isinstance(single_gpu_model, cuRFR_sg) + else: + assert False + + @pytest.mark.parametrize('n_estimators', [5, 10, 20]) @pytest.mark.parametrize('detailed_text', [True, False]) def test_rf_get_text(client, n_estimators, detailed_text): From fa9edbc268ac93e5ca6ed69704f6cd0c55b25f4c Mon Sep 17 00:00:00 2001 From: Vinay Deshpande Date: Tue, 2 Feb 2021 20:53:24 +0530 Subject: [PATCH 03/29] Enable feature sampling for the experimental backend of Random Forest (#3364) Feature sampling was not supported for experimental backend of Random Forest (RF). This PR implements feature sampling without needing any auxiliary memory storage. Thus the `colids` array is also removed. Authors: - Vinay Deshpande (@vinaydes) Approvers: - Thejaswi. N. 
Authors:
  - Vinay Deshpande (@vinaydes)

Approvers:
  - Thejaswi. N. S (@teju85)
  - Philip Hyunsu Cho (@hcho3)
  - John Zedlewski (@JohnZed)

URL: https://github.com/rapidsai/cuml/pull/3364
---
 cpp/bench/sg/fil.cu                           | 15 ++--
 cpp/bench/sg/rf_classifier.cu                 | 14 +--
 cpp/bench/sg/rf_regressor.cu                  | 14 +--
 cpp/include/cuml/ensemble/randomforest.hpp    | 11 +--
 cpp/include/cuml/tree/decisiontree.hpp        | 16 ++--
 cpp/include/cuml/tree/flatnode.h              |  1 +
 .../batched-levelalgo/builder.cuh             | 35 ++++----
 .../batched-levelalgo/builder_base.cuh        | 25 ++++--
 .../decisiontree/batched-levelalgo/input.cuh  |  4 +-
 .../batched-levelalgo/kernels.cuh             | 90 +++++++++++++++++--
 .../decisiontree/batched-levelalgo/node.cuh   |  4 +-
 .../decisiontree/batched-levelalgo/split.cuh  |  4 +-
 cpp/src/decisiontree/decisiontree.cu          | 28 +++---
 cpp/src/decisiontree/decisiontree_impl.cuh    | 41 +++++----
 cpp/src/decisiontree/decisiontree_impl.h      | 12 +--
 cpp/src/randomforest/randomforest.cu          |  8 +-
 cpp/src/randomforest/randomforest_impl.cuh    |  9 +-
 cpp/test/sg/decisiontree_batchedlevel_algo.cu | 19 ++--
 .../sg/decisiontree_batchedlevel_unittest.cu  | 12 +--
 cpp/test/sg/rf_accuracy_test.cu               |  4 +-
 cpp/test/sg/rf_batched_classification_test.cu |  4 +-
 cpp/test/sg/rf_batched_regression_test.cu     |  4 +-
 cpp/test/sg/rf_depth_test.cu                  |  6 +-
 cpp/test/sg/rf_test.cu                        |  6 +-
 cpp/test/sg/rf_treelite_test.cu               |  4 +-
 python/cuml/ensemble/randomforest_shared.pxd  |  6 +-
 .../cuml/ensemble/randomforestclassifier.pyx  |  7 +-
 .../cuml/ensemble/randomforestregressor.pyx   |  7 +-
 python/cuml/test/test_random_forest.py        |  6 +-
 29 files changed, 253 insertions(+), 163 deletions(-)

diff --git a/cpp/bench/sg/fil.cu b/cpp/bench/sg/fil.cu
index a5a9d7579c..7a80effc64 100644
--- a/cpp/bench/sg/fil.cu
+++ b/cpp/bench/sg/fil.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -143,12 +143,13 @@ std::vector<Params> getInputs() {
                  .shuffle = false,
                  .seed = 12345ULL};
 
-  set_rf_params(p.rf,  // Output RF parameters
-                1,     // n_trees, just a placeholder value, anyway changed below
-                true,  // bootstrap
-                1.f,   // max_samples
-                1234,  // seed
-                8);    // n_streams
+  set_rf_params(p.rf,     // Output RF parameters
+                1,        // n_trees, just a placeholder value,
+                          // anyway changed below
+                true,     // bootstrap
+                1.f,      // max_samples
+                1234ULL,  // seed
+                8);       // n_streams
 
   set_tree_params(p.rf.tree_params,  // Output tree parameters
                   10,  // max_depth, just a placeholder value,
diff --git a/cpp/bench/sg/rf_classifier.cu b/cpp/bench/sg/rf_classifier.cu
index 5e2f4f3fcf..29a9d0c371 100644
--- a/cpp/bench/sg/rf_classifier.cu
+++ b/cpp/bench/sg/rf_classifier.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -83,12 +83,12 @@ std::vector<Params> getInputs() {
                 10.0,         // center_box_max
                 2152953ULL};  //seed
 
-  set_rf_params(p.rf,  // Output RF parameters
-                500,   // n_trees
-                true,  // bootstrap
-                1.f,   // max_samples
-                1234,  // seed
-                8);    // n_streams
+  set_rf_params(p.rf,     // Output RF parameters
+                500,      // n_trees
+                true,     // bootstrap
+                1.f,      // max_samples
+                1234ULL,  // seed
+                8);       // n_streams
 
   set_tree_params(p.rf.tree_params,  // Output tree parameters
                   10,  // max_depth, this is anyway changed below
diff --git a/cpp/bench/sg/rf_regressor.cu b/cpp/bench/sg/rf_regressor.cu
index 59d01770ec..a6771ae567 100644
--- a/cpp/bench/sg/rf_regressor.cu
+++ b/cpp/bench/sg/rf_regressor.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -85,12 +85,12 @@ std::vector<RegParams> getInputs() {
                 .noise = 1.0,
                 .seed = 12345ULL};
 
-  set_rf_params(p.rf,  // Output RF parameters
-                500,   // n_trees
-                true,  // bootstrap
-                1.f,   // max_samples
-                1234,  // seed
-                8);    // n_streams
+  set_rf_params(p.rf,     // Output RF parameters
+                500,      // n_trees
+                true,     // bootstrap
+                1.f,      // max_samples
+                1234ULL,  // seed
+                8);       // n_streams
 
   set_tree_params(p.rf.tree_params,  // Output tree parameters
                   10,  // max_depth, just a place holder value,
diff --git a/cpp/include/cuml/ensemble/randomforest.hpp b/cpp/include/cuml/ensemble/randomforest.hpp
index 32a9447a63..525d6e07ea 100644
--- a/cpp/include/cuml/ensemble/randomforest.hpp
+++ b/cpp/include/cuml/ensemble/randomforest.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -77,7 +77,7 @@ struct RF_params {
   /**
    * random seed
    */
-  int seed;
+  uint64_t seed;
   /**
    * Number of concurrent GPU streams for parallel tree building.
    * Each stream is independently managed by CPU thread.
@@ -89,9 +89,10 @@ struct RF_params {
 
 void set_rf_params(RF_params& params, int cfg_n_trees = 1,
                    bool cfg_bootstrap = true, float cfg_max_samples = 1.0f,
-                   int cfg_seed = -1, int cfg_n_streams = 8);
+                   uint64_t cfg_seed = 0, int cfg_n_streams = 8);
 void set_all_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap,
-                       float cfg_max_samples, int cfg_seed, int cfg_n_streams,
+                       float cfg_max_samples, uint64_t cfg_seed,
+                       int cfg_n_streams,
                        DecisionTree::DecisionTreeParams cfg_tree_params);
 void validity_check(const RF_params rf_params);
 void print(const RF_params rf_params);
@@ -190,7 +191,7 @@ RF_params set_rf_class_obj(int max_depth, int max_leaves, float max_features,
                            int n_bins, int split_algo, int min_samples_leaf,
                            int min_samples_split, float min_impurity_decrease,
                            bool bootstrap_features, bool bootstrap, int n_trees,
-                           float max_samples, int seed,
+                           float max_samples, uint64_t seed,
                            CRITERION split_criterion, bool quantile_per_tree,
                            int cfg_n_streams, bool use_experimental_backend,
                            int max_batch_size);
diff --git a/cpp/include/cuml/tree/decisiontree.hpp b/cpp/include/cuml/tree/decisiontree.hpp
index 523536daf8..91b03a87d5 100644
--- a/cpp/include/cuml/tree/decisiontree.hpp
+++ b/cpp/include/cuml/tree/decisiontree.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -200,6 +200,7 @@ typedef TreeMetaDataNode<double, int> TreeClassifierD;
  * @param[in] n_unique_labels: number of unique label values. Number of
  *                             categories of classification.
  * @param[in] tree_params: Decision Tree training hyper parameter struct.
+ * @param[in] seed: Controls the randomness in tree fitting/growing algorithm.
  * @{
  */
 void decisionTreeClassifierFit(const raft::handle_t &handle,
                                const int ncols, const int nrows, int *labels,
                                unsigned int *rowids, const int n_sampled_rows,
                                int unique_labels,
-                               DecisionTree::DecisionTreeParams tree_params);
+                               DecisionTree::DecisionTreeParams tree_params,
+                               uint64_t seed);
 void decisionTreeClassifierFit(const raft::handle_t &handle,
                                TreeClassifierD *&tree, double *data,
                                const int ncols, const int nrows, int *labels,
                                unsigned int *rowids, const int n_sampled_rows,
                                int unique_labels,
-                               DecisionTree::DecisionTreeParams tree_params);
+                               DecisionTree::DecisionTreeParams tree_params,
+                               uint64_t seed);
 /** @} */
 
 /**
@@ -268,18 +271,21 @@ typedef TreeMetaDataNode<double, double> TreeRegressorD;
  * @param[in] n_sampled_rows: number of training samples, after sampling. If using decision
  *                            tree directly over the whole dataset: n_sampled_rows = nrows
  * @param[in] tree_params: Decision Tree training hyper parameter struct.
+ * @param[in] seed: Controls the randomness in tree fitting/growing algorithm.
  * @{
  */
 void decisionTreeRegressorFit(const raft::handle_t &handle,
                               TreeRegressorF *&tree, float *data,
                               const int ncols, const int nrows, float *labels,
                               unsigned int *rowids, const int n_sampled_rows,
-                              DecisionTree::DecisionTreeParams tree_params);
+                              DecisionTree::DecisionTreeParams tree_params,
+                              uint64_t seed);
 void decisionTreeRegressorFit(const raft::handle_t &handle,
                               TreeRegressorD *&tree, double *data,
                               const int ncols, const int nrows, double *labels,
                               unsigned int *rowids, const int n_sampled_rows,
-                              DecisionTree::DecisionTreeParams tree_params);
+                              DecisionTree::DecisionTreeParams tree_params,
+                              uint64_t seed);
 /** @} */
 
 /**
diff --git a/cpp/include/cuml/tree/flatnode.h b/cpp/include/cuml/tree/flatnode.h
index 329961567b..138182e550 100644
--- a/cpp/include/cuml/tree/flatnode.h
+++ b/cpp/include/cuml/tree/flatnode.h
@@ -31,6 +31,7 @@ struct SparseTreeNode {
   DataT quesval;
   DataT best_metric_val;
   IdxT left_child_id = IdxT(-1);
+  uint32_t unique_id = UINT32_MAX;
 };
 
 template <typename T, typename L>
diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
index 79d18022b1..5100b6038a 100644
--- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -42,17 +42,18 @@
 template <typename Traits, typename DataT, typename LabelT, typename IdxT>
 void grow_tree(std::shared_ptr<raft::mr::device::allocator> d_allocator,
                std::shared_ptr<raft::mr::host::allocator> h_allocator,
-               const DataT* data, IdxT ncols, IdxT nrows, const LabelT* labels,
-               const DataT* quantiles, IdxT* rowids, IdxT* colids,
-               int n_sampled_rows, int unique_labels,
+               const DataT* data, IdxT treeid, uint64_t seed, IdxT ncols,
+               IdxT nrows, const LabelT* labels, const DataT* quantiles,
+               IdxT* rowids, int n_sampled_rows, int unique_labels,
                const DecisionTreeParams& params, cudaStream_t stream,
                std::vector<SparseTreeNode<DataT, LabelT, IdxT>>& sparsetree,
                IdxT& num_leaves, IdxT& depth) {
   Builder<Traits> builder;
   size_t d_wsize, h_wsize;
-  builder.workspaceSize(d_wsize, h_wsize, params, data, labels, nrows, ncols,
-                        n_sampled_rows, IdxT(params.max_features * ncols),
-                        rowids, colids, unique_labels, quantiles);
+  builder.workspaceSize(d_wsize, h_wsize, treeid, seed, params, data, labels,
+                        nrows, ncols, n_sampled_rows,
+                        IdxT(params.max_features * ncols), rowids,
+                        unique_labels, quantiles);
   MLCommon::device_buffer<char> d_buff(d_allocator, stream, d_wsize);
   MLCommon::host_buffer<char> h_buff(h_allocator, stream, h_wsize);
 
@@ -100,29 +101,29 @@
 template <typename DataT, typename LabelT, typename IdxT>
 void grow_tree(std::shared_ptr<raft::mr::device::allocator> d_allocator,
                std::shared_ptr<raft::mr::host::allocator> h_allocator,
-               const DataT* data, IdxT ncols, IdxT nrows, const LabelT* labels,
-               const DataT* quantiles, IdxT* rowids, IdxT* colids,
-               int n_sampled_rows, int unique_labels,
+               const DataT* data, IdxT treeid, uint64_t seed, IdxT ncols,
+               IdxT nrows, const LabelT* labels, const DataT* quantiles,
+               IdxT* rowids, int n_sampled_rows, int unique_labels,
                const DecisionTreeParams& params, cudaStream_t stream,
                std::vector<SparseTreeNode<DataT, LabelT, IdxT>>& sparsetree,
                IdxT& num_leaves, IdxT& depth) {
   typedef ClsTraits<DataT, LabelT, IdxT> Traits;
-  grow_tree<Traits>(d_allocator, h_allocator, data, ncols, nrows, labels,
-                    quantiles, rowids, colids, n_sampled_rows, unique_labels,
+  grow_tree<Traits>(d_allocator, h_allocator, data, treeid, seed, ncols, nrows,
+                    labels, quantiles, rowids, n_sampled_rows, unique_labels,
                     params, stream, sparsetree, num_leaves, depth);
 }
 
 template <typename DataT, typename IdxT>
 void grow_tree(std::shared_ptr<raft::mr::device::allocator> d_allocator,
                std::shared_ptr<raft::mr::host::allocator> h_allocator,
-               const DataT* data, IdxT ncols, IdxT nrows, const DataT* labels,
-               const DataT* quantiles, IdxT* rowids, IdxT* colids,
-               int n_sampled_rows, int unique_labels,
+               const DataT* data, IdxT treeid, uint64_t seed, IdxT ncols,
+               IdxT nrows, const DataT* labels, const DataT* quantiles,
+               IdxT* rowids, int n_sampled_rows, int unique_labels,
                const DecisionTreeParams& params, cudaStream_t stream,
                std::vector<SparseTreeNode<DataT, DataT, IdxT>>& sparsetree,
                IdxT& num_leaves, IdxT& depth) {
   typedef RegTraits<DataT, IdxT> Traits;
-  grow_tree<Traits>(d_allocator, h_allocator, data, ncols, nrows, labels,
-                    quantiles, rowids, colids, n_sampled_rows, unique_labels,
+  grow_tree<Traits>(d_allocator, h_allocator, data, treeid, seed, ncols, nrows,
+                    labels, quantiles, rowids, n_sampled_rows, unique_labels,
                     params, stream, sparsetree, num_leaves, depth);
 }
 /** @} */
diff --git a/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh b/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh
index 030b08e8e8..79ac2ddda5 100644
--- a/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -60,6 +60,10 @@ struct Builder {
   /** size of block-sync workspace (regression + MAE only) */
   size_t block_sync_size;
 
+  /** Tree index */
+  IdxT treeid;
+  /** Seed used for randomization */
+  uint64_t seed;
   /** number of nodes created in the current batch */
   IdxT* n_nodes;
   /** class histograms (classification only) */
@@ -133,14 +137,16 @@ struct Builder {
    *                       be computed fresh. [on device] [col-major]
    *                       [dim = nbins x sampledCols]
    */
-  void workspaceSize(size_t& d_wsize, size_t& h_wsize,
-                     const DecisionTreeParams& p, const DataT* data,
-                     const LabelT* labels, IdxT totalRows, IdxT totalCols,
-                     IdxT sampledRows, IdxT sampledCols, IdxT* rowids,
-                     IdxT* colids, IdxT nclasses, const DataT* quantiles) {
+  void workspaceSize(size_t& d_wsize, size_t& h_wsize, IdxT treeid,
+                     uint64_t seed, const DecisionTreeParams& p,
+                     const DataT* data, const LabelT* labels, IdxT totalRows,
+                     IdxT totalCols, IdxT sampledRows, IdxT sampledCols,
+                     IdxT* rowids, IdxT nclasses, const DataT* quantiles) {
     ASSERT(quantiles != nullptr,
           "Currently quantiles need to be computed before this call!");
     params = p;
+    this->treeid = treeid;
+    this->seed = seed;
     n_blks_for_cols = std::min(sampledCols, n_blks_for_cols);
     input.data = data;
     input.labels = labels;
@@ -149,7 +155,6 @@ struct Builder {
     input.nSampledRows = sampledRows;
     input.nSampledCols = sampledCols;
     input.rowids = rowids;
-    input.colids = colids;
     input.nclasses = nclasses;
     input.quantiles = quantiles;
     auto max_batch = params.max_batch_size;
@@ -294,6 +299,7 @@ struct Builder {
     h_nodes[0].start = 0;
     h_nodes[0].count = input.nSampledRows;
     h_nodes[0].depth = 0;
+    h_nodes[0].info.unique_id = 0;
   }
 
   /** check whether any more nodes need to be processed or not */
@@ -323,6 +329,7 @@ struct Builder {
     // start fresh on the number of *new* nodes created in this batch
     CUDA_CHECK(cudaMemsetAsync(n_nodes, 0, sizeof(IdxT), s));
     initSplit(splits, batchSize, s);
+
     // get the current set of nodes to be worked upon
     raft::update_device(curr_nodes, h_nodes.data() + node_start, batchSize, s);
     // iterate through a batch of columns (to reduce the memory pressure) and
@@ -404,7 +411,7 @@ struct ClsTraits {
       b.hist, b.params.n_bins, b.params.max_depth, b.params.min_samples_split,
       b.params.min_samples_leaf, b.params.min_impurity_decrease,
       b.params.max_leaves, b.input, b.curr_nodes, col, b.done_count, b.mutex,
-      b.n_leaves, b.splits, splitType);
+      b.n_leaves, b.splits, splitType, b.treeid, b.seed);
   }
 
   /**
@@ -480,7 +487,7 @@ struct RegTraits {
       b.params.max_depth, b.params.min_samples_split,
       b.params.min_samples_leaf, b.params.min_impurity_decrease,
       b.params.max_leaves, b.input, b.curr_nodes, col, b.done_count, b.mutex,
-      b.n_leaves, b.splits, b.block_sync, splitType);
+      b.n_leaves, b.splits, b.block_sync, splitType, b.treeid, b.seed);
   }
 
   /**
diff --git a/cpp/src/decisiontree/batched-levelalgo/input.cuh b/cpp/src/decisiontree/batched-levelalgo/input.cuh
index 7e3a54c908..c87032ee9a 100644
--- a/cpp/src/decisiontree/batched-levelalgo/input.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/input.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -35,8 +35,6 @@ struct Input {
   IdxT nSampledCols;
   /** indices of sampled rows */
   IdxT* rowids;
-  /** indices of sampled cols */
-  IdxT* colids;
   /** number of classes (useful only in classification) */
   IdxT nclasses;
   /** quantiles/histogram computed on the dataset (col-major) */
diff --git a/cpp/src/decisiontree/batched-levelalgo/kernels.cuh b/cpp/src/decisiontree/batched-levelalgo/kernels.cuh
index 40ae3de461..1a35eb9f40 100644
--- a/cpp/src/decisiontree/batched-levelalgo/kernels.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/kernels.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -264,13 +264,78 @@ __device__ OutT* alignPointer(InT input) {
     raft::alignTo(reinterpret_cast<size_t>(input), sizeof(OutT)));
 }
 
+// 32-bit FNV1a hash
+// Reference: http://www.isthe.com/chongo/tech/comp/fnv/index.html
+const uint32_t fnv1a32_prime = uint32_t(16777619);
+const uint32_t fnv1a32_basis = uint32_t(2166136261);
+
+DI uint32_t fnv1a32(uint32_t hash, uint32_t txt) {
+  hash ^= (txt >> 0) & 0xFF;
+  hash *= fnv1a32_prime;
+  hash ^= (txt >> 8) & 0xFF;
+  hash *= fnv1a32_prime;
+  hash ^= (txt >> 16) & 0xFF;
+  hash *= fnv1a32_prime;
+  hash ^= (txt >> 24) & 0xFF;
+  hash *= fnv1a32_prime;
+  return hash;
+}
+
+/**
+ * @brief For given values of (treeid, nodeid, seed), this function generates
+ *        a unique permutation of [0, N - 1] values and returns the 'k'th
+ *        entry from the permutation.
+ * @return The 'k'th value from the permutation
+ * @note This function does not allocate any temporary buffer; all the
+ *       necessary values are recomputed.
+ */
+template <typename IdxT>
+DI IdxT select(IdxT k, IdxT treeid, uint32_t nodeid, uint64_t seed, IdxT N) {
+  __shared__ int blksum;
+  uint32_t pivot_hash;
+  int cnt = 0;
+
+  if (threadIdx.x == 0) {
+    blksum = 0;
+  }
+  // Compute hash for the 'k'th index and use it as the pivot for sorting
+  pivot_hash = fnv1a32_basis;
+  pivot_hash = fnv1a32(pivot_hash, uint32_t(k));
+  pivot_hash = fnv1a32(pivot_hash, uint32_t(treeid));
+  pivot_hash = fnv1a32(pivot_hash, uint32_t(nodeid));
+  pivot_hash = fnv1a32(pivot_hash, uint32_t(seed >> 32));
+  pivot_hash = fnv1a32(pivot_hash, uint32_t(seed));
+
+  // Compute hash for the rest of the indices and count instances where i_hash
+  // is less than pivot_hash
+  uint32_t i_hash;
+  for (int i = threadIdx.x; i < N; i += blockDim.x) {
+    if (i == k) continue;  // Skip since k is the pivot index
+    i_hash = fnv1a32_basis;
+    i_hash = fnv1a32(i_hash, uint32_t(i));
+    i_hash = fnv1a32(i_hash, uint32_t(treeid));
+    i_hash = fnv1a32(i_hash, uint32_t(nodeid));
+    i_hash = fnv1a32(i_hash, uint32_t(seed >> 32));
+    i_hash = fnv1a32(i_hash, uint32_t(seed));
+
+    if (i_hash < pivot_hash)
+      cnt++;
+    else if (i_hash == pivot_hash && i < k)
+      cnt++;
+  }
+  __syncthreads();
+  if (cnt > 0) atomicAdd(&blksum, cnt);
+  __syncthreads();
+  return blksum;
+}
+
 template <typename DataT, typename LabelT, typename IdxT, int TPB>
 __global__ void computeSplitClassificationKernel(
   int* hist, IdxT nbins, IdxT max_depth, IdxT min_samples_split,
   IdxT min_samples_leaf, DataT min_impurity_decrease, IdxT max_leaves,
   Input<DataT, LabelT, IdxT> input, const Node<DataT, LabelT, IdxT>* nodes,
   IdxT colStart, int* done_count, int* mutex, const IdxT* n_leaves,
-  Split<DataT, IdxT>* splits, CRITERION splitType) {
+  Split<DataT, IdxT>* splits, CRITERION splitType, IdxT treeid,
+  uint64_t seed) {
   extern __shared__ char smem[];
   IdxT nid = blockIdx.z;
   auto node = nodes[nid];
@@ -288,7 +353,15 @@ __global__ void computeSplitClassificationKernel(
   auto* sDone = alignPointer<int>(sbins + nbins);
   IdxT stride = blockDim.x * gridDim.x;
   IdxT tid = threadIdx.x + blockIdx.x * blockDim.x;
-  auto col = input.colids[colStart + blockIdx.y];
+
+  IdxT col;
+  if (input.nSampledCols == input.N) {
+    col = colStart + blockIdx.y;
+  } else {
+    int colIndex = colStart + blockIdx.y;
+    col = select(colIndex, treeid, node.info.unique_id, seed, input.N);
+  }
+
   for (IdxT i = threadIdx.x; i < len; i += blockDim.x) shist[i] = 0;
   for (IdxT b = threadIdx.x; b < nbins; b += blockDim.x)
     sbins[b] = input.quantiles[col * nbins + b];
@@ -343,7 +416,8 @@ __global__ void computeSplitRegressionKernel(
   DataT min_impurity_decrease, IdxT max_leaves,
   Input<DataT, DataT, IdxT> input, const Node<DataT, DataT, IdxT>* nodes,
   IdxT colStart, int* done_count, int* mutex, const IdxT* n_leaves,
-  Split<DataT, IdxT>* splits, void* workspace, CRITERION splitType) {
+  Split<DataT, IdxT>* splits, void* workspace, CRITERION splitType,
+  IdxT treeid, uint64_t seed) {
   extern __shared__ char smem[];
   IdxT nid = blockIdx.z;
   auto node = nodes[nid];
@@ -364,7 +438,13 @@ __global__ void computeSplitRegressionKernel(
   auto* sDone = alignPointer<int>(spredP + nbins);
   IdxT stride = blockDim.x * gridDim.x;
   IdxT tid = threadIdx.x + blockIdx.x * blockDim.x;
-  auto col = input.colids[colStart + blockIdx.y];
+  IdxT col;
+  if (input.nSampledCols == input.N) {
+    col = colStart + blockIdx.y;
+  } else {
+    int colIndex = colStart + blockIdx.y;
+    col = select(colIndex, treeid, node.info.unique_id, seed, input.N);
+  }
   for (IdxT i = threadIdx.x; i < len; i += blockDim.x) {
     spred[i] = DataT(0.0);
   }
diff --git a/cpp/src/decisiontree/batched-levelalgo/node.cuh b/cpp/src/decisiontree/batched-levelalgo/node.cuh
index 827ef914a2..68c4652066 100644
--- a/cpp/src/decisiontree/batched-levelalgo/node.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/node.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -104,12 +104,14 @@
     nodes[pos].depth = depth + 1;
     nodes[pos].start = start;
     nodes[pos].count = split.nLeft;
+    nodes[pos].info.unique_id = 2 * info.unique_id + 1;
     // right
     ++pos;
     nodes[pos].initSpNode();
     nodes[pos].depth = depth + 1;
     nodes[pos].start = start + split.nLeft;
     nodes[pos].count = count - split.nLeft;
+    nodes[pos].info.unique_id = 2 * info.unique_id + 2;
     // update depth
     auto val = atomicMax(n_depth, depth + 1);
     __threadfence();
diff --git a/cpp/src/decisiontree/batched-levelalgo/split.cuh b/cpp/src/decisiontree/batched-levelalgo/split.cuh
index e05091ded5..3417d540bd 100644
--- a/cpp/src/decisiontree/batched-levelalgo/split.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/split.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -148,7 +148,7 @@ void initSplit(Split<DataT, IdxT>* splits, IdxT len, cudaStream_t s) {
 template <typename DataT, typename IdxT, int TPB = 256>
 void printSplits(Split<DataT, IdxT>* splits, IdxT len, cudaStream_t s) {
   auto op = [] __device__(Split<DataT, IdxT> * ptr, IdxT idx) {
-    printf("quesval = %f, colid = %d, best_metric_val = %f, nLeft = %d\n",
+    printf("quesval = %e, colid = %d, best_metric_val = %e, nLeft = %d\n",
            ptr->quesval, ptr->colid, ptr->best_metric_val, ptr->nLeft);
   };
   raft::linalg::writeOnlyUnaryOp<Split<DataT, IdxT>, decltype(op), IdxT, TPB>(
diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu
index 7d6e900061..8f3ded17da 100644
--- a/cpp/src/decisiontree/decisiontree.cu
+++ b/cpp/src/decisiontree/decisiontree.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,12 +56,6 @@ void set_tree_params(DecisionTreeParams &params, int cfg_max_depth,
       "To use experimental backend set split_algo = 1 (GLOBAL_QUANTILE)");
     cfg_use_experimental_backend = false;
   }
-  if (cfg_max_features != 1.0) {
-    CUML_LOG_WARN(
-      "Experimental backend does not yet support feature sub-sampling");
-    CUML_LOG_WARN("To use experimental backend set max_features = 1.0");
-    cfg_use_experimental_backend = false;
-  }
   if (cfg_quantile_per_tree) {
     CUML_LOG_WARN(
       "Experimental backend does not yet support per tree quantile "
@@ -160,11 +154,12 @@ void decisionTreeClassifierFit(const raft::handle_t &handle,
                                const int ncols, const int nrows, int *labels,
                                unsigned int *rowids, const int n_sampled_rows,
                                int unique_labels,
-                               DecisionTree::DecisionTreeParams tree_params) {
+                               DecisionTree::DecisionTreeParams tree_params,
+                               uint64_t seed) {
   std::shared_ptr<DecisionTreeClassifier<float>> dt_classifier =
     std::make_shared<DecisionTreeClassifier<float>>();
   dt_classifier->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
-                     unique_labels, tree, tree_params);
+                     unique_labels, tree, tree_params, seed);
 }
 
 void decisionTreeClassifierFit(const raft::handle_t &handle,
                                TreeClassifierD *&tree, double *data,
                                const int ncols, const int nrows, int *labels,
                                unsigned int *rowids, const int n_sampled_rows,
                                int unique_labels,
-                               DecisionTree::DecisionTreeParams tree_params) {
+                               DecisionTree::DecisionTreeParams tree_params,
+                               uint64_t seed) {
   std::shared_ptr<DecisionTreeClassifier<double>> dt_classifier =
     std::make_shared<DecisionTreeClassifier<double>>();
   dt_classifier->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
-                     unique_labels, tree, tree_params);
+                     unique_labels, tree, tree_params, seed);
 }
 
 void decisionTreeClassifierPredict(const raft::handle_t &handle,
@@ -207,22 +203,24 @@ void decisionTreeRegressorFit(const raft::handle_t &handle,
                               TreeRegressorF *&tree, float *data,
                               const int ncols, const int nrows, float *labels,
                               unsigned int *rowids, const int n_sampled_rows,
-                              DecisionTree::DecisionTreeParams tree_params) {
+                              DecisionTree::DecisionTreeParams tree_params,
+                              uint64_t seed) {
   std::shared_ptr<DecisionTreeRegressor<float>> dt_regressor =
     std::make_shared<DecisionTreeRegressor<float>>();
   dt_regressor->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
-                    tree, tree_params);
+                    tree, tree_params, seed);
 }
 
 void decisionTreeRegressorFit(const raft::handle_t &handle,
                               TreeRegressorD *&tree, double *data,
                               const int ncols, const int nrows, double *labels,
                               unsigned int *rowids, const int n_sampled_rows,
-                              DecisionTree::DecisionTreeParams tree_params) {
+                              DecisionTree::DecisionTreeParams tree_params,
+                              uint64_t seed) {
   std::shared_ptr<DecisionTreeRegressor<double>> dt_regressor =
     std::make_shared<DecisionTreeRegressor<double>>();
   dt_regressor->fit(handle, data, ncols, nrows, labels, rowids, n_sampled_rows,
-                    tree, tree_params);
+                    tree, tree_params, seed);
 }
 
 void decisionTreeRegressorPredict(const raft::handle_t &handle,
diff --git a/cpp/src/decisiontree/decisiontree_impl.cuh b/cpp/src/decisiontree/decisiontree_impl.cuh
index b1f70818a0..f742643471 100644
--- a/cpp/src/decisiontree/decisiontree_impl.cuh
+++ b/cpp/src/decisiontree/decisiontree_impl.cuh
@@ -51,10 +51,15 @@ namespace DecisionTree {
 template <typename T, typename L>
 void print(const SparseTreeNode<T, L> &node, std::ostream &os) {
   if (node.colid == -1) {
-    os << "(leaf, " << node.prediction << ", " << node.best_metric_val << ")";
+    os << "(leaf, "
+       << "prediction: " << node.prediction
+       << ", best_metric_val: " << node.best_metric_val
+       << ", UID: " << node.unique_id << ")";
   } else {
-    os << "(" << node.colid << ", " << node.quesval << ", "
-       << node.best_metric_val << ")";
+    os << "("
+       << "colid: " << node.colid << ", quesval: " << node.quesval
+       << ", best_metric_val: " << node.best_metric_val
+       << ", UID: " << node.unique_id << ")";
   }
   return;
 }
@@ -255,7 +260,8 @@ template <typename T, typename L>
 void DecisionTreeBase<T, L>::plant(
   std::vector<SparseTreeNode<T, L>> &sparsetree, const T *data, const int ncols,
   const int nrows, const L *labels, unsigned int *rowids,
-  const int n_sampled_rows, int unique_labels, const int treeid) {
+  const int n_sampled_rows, int unique_labels, const int treeid,
+  uint64_t seed) {
   dinfo.NLocalrows = nrows;
   dinfo.NGlobalrows = nrows;
   dinfo.Ncols = ncols;
@@ -288,11 +294,8 @@ void DecisionTreeBase<T, L>::plant(
       CUML_LOG_WARN("Using experimental backend for growing trees\n");
     }
     T *quantiles = tempmem->d_quantile->data();
-    int *colids = (int *)tempmem->device_allocator->allocate(
-      sizeof(int) * ncols, tempmem->stream);
-    MLCommon::iota(colids, 0, 1, ncols, tempmem->stream);
-    grow_tree(tempmem->device_allocator, tempmem->host_allocator, data, ncols,
-              nrows, labels, quantiles, (int *)rowids, (int *)colids,
+    grow_tree(tempmem->device_allocator, tempmem->host_allocator, data, treeid,
+              seed, ncols, nrows, labels, quantiles, (int *)rowids,
              n_sampled_rows, unique_labels, tree_params, tempmem->stream,
              sparsetree, this->leaf_counter, this->depth_counter);
   } else {
@@ -371,7 +374,7 @@ void DecisionTreeBase<T, L>::base_fit(
   const cudaStream_t stream_in, const T *data, const int ncols,
   const int nrows, const L *labels, unsigned int *rowids,
   const int n_sampled_rows, int unique_labels,
   std::vector<SparseTreeNode<T, L>> &sparsetree,
-  const int treeid, bool is_classifier,
+  const int treeid, uint64_t seed, bool is_classifier,
   std::shared_ptr<TemporaryMemory<T, L>> in_tempmem) {
   prepare_fit_timer.reset();
   const char *CRITERION_NAME[] = {"GINI", "ENTROPY", "MSE", "MAE", "END"};
@@ -408,7 +411,7 @@ void DecisionTreeBase<T, L>::base_fit(
   }
 
   plant(sparsetree, data, ncols, nrows, labels, rowids, n_sampled_rows,
-        unique_labels, treeid);
+        unique_labels, treeid, seed);
   if (in_tempmem == nullptr) {
     tempmem.reset();
   }
@@ -419,13 +422,13 @@ void DecisionTreeClassifier<T>::fit(
   const raft::handle_t &handle, const T *data, const int ncols,
   const int nrows, const int *labels, unsigned int *rowids,
   const int n_sampled_rows, const int unique_labels,
   TreeMetaDataNode<T, int> *&tree,
-  DecisionTreeParams tree_parameters,
+  DecisionTreeParams tree_parameters, uint64_t seed,
   std::shared_ptr<TemporaryMemory<T, int>> in_tempmem) {
   this->tree_params = tree_parameters;
   this->base_fit(handle.get_device_allocator(), handle.get_host_allocator(),
                  handle.get_stream(), data, ncols, nrows, labels, rowids,
                  n_sampled_rows, unique_labels, tree->sparsetree, tree->treeid,
-                 true, in_tempmem);
+                 seed, true, in_tempmem);
   this->set_metadata(tree);
 }
@@ -138,6 +139,7 @@ class DecisionTreeClassifier : public DecisionTreeBase<T, int> {
           const int nrows, const int *labels, unsigned int *rowids,
           const int n_sampled_rows, const int unique_labels,
           TreeMetaDataNode<T, int> *&tree, DecisionTreeParams tree_parameters,
+          uint64_t seed,
           std::shared_ptr<TemporaryMemory<T, int>> in_tempmem = nullptr);
 
   //This fit fucntion does not take handle , used by RF
@@ -147,7 +149,7 @@ class DecisionTreeClassifier : public DecisionTreeBase<T, int> {
           const int nrows, const int *labels, unsigned int *rowids,
           const int n_sampled_rows, const int unique_labels,
           TreeMetaDataNode<T, int> *&tree, DecisionTreeParams tree_parameters,
-          std::shared_ptr<TemporaryMemory<T, int>> in_tempmem);
+          uint64_t seed, std::shared_ptr<TemporaryMemory<T, int>> in_tempmem);
 
  private:
   void grow_deep_tree(const T *data, const int *labels, unsigned int *rowids,
@@ -165,7 +167,7 @@ class DecisionTreeRegressor : public DecisionTreeBase<T, T> {
   void fit(const raft::handle_t &handle, const T *data, const int ncols,
            const int nrows, const T *labels, unsigned int *rowids,
            const int n_sampled_rows, TreeMetaDataNode<T, T> *&tree,
-           DecisionTreeParams tree_parameters,
+           DecisionTreeParams tree_parameters, uint64_t seed,
           std::shared_ptr<TemporaryMemory<T, T>> in_tempmem = nullptr);
 
   //This fit function does not take handle. Used by RF
@@ -174,7 +176,7 @@ class DecisionTreeRegressor : public DecisionTreeBase<T, T> {
            const cudaStream_t stream_in, const T *data, const int ncols,
            const int nrows, const T *labels, unsigned int *rowids,
            const int n_sampled_rows, TreeMetaDataNode<T, T> *&tree,
-           DecisionTreeParams tree_parameters,
+           DecisionTreeParams tree_parameters, uint64_t seed,
           std::shared_ptr<TemporaryMemory<T, T>> in_tempmem);
 
  private:
diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu
index 52c4b4153c..4438bbfa60 100644
--- a/cpp/src/randomforest/randomforest.cu
+++ b/cpp/src/randomforest/randomforest.cu
@@ -162,7 +162,8 @@ void postprocess_labels(int n_rows, std::vector<int>& labels,
 * @param[in] cfg_n_streams: No of parallel CUDA for training forest
 */
 void set_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap,
-                   float cfg_max_samples, int cfg_seed, int cfg_n_streams) {
+                   float cfg_max_samples, uint64_t cfg_seed,
+                   int cfg_n_streams) {
   params.n_trees = cfg_n_trees;
   params.bootstrap = cfg_bootstrap;
   params.max_samples = cfg_max_samples;
@@ -186,7 +187,8 @@ void set_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap,
 * @param[in] cfg_tree_params: tree parameters
 */
 void set_all_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap,
-                       float cfg_max_samples, int cfg_seed, int cfg_n_streams,
+                       float cfg_max_samples, uint64_t cfg_seed,
+                       int cfg_n_streams,
                        DecisionTree::DecisionTreeParams cfg_tree_params) {
   params.n_trees = cfg_n_trees;
   params.bootstrap = cfg_bootstrap;
@@ -651,7 +653,7 @@ RF_params set_rf_class_obj(int max_depth, int max_leaves, float max_features,
                            int n_bins, int split_algo, int min_samples_leaf,
                            int min_samples_split, float min_impurity_decrease,
                            bool bootstrap_features, bool bootstrap, int n_trees,
-                           float max_samples, int seed,
+                           float max_samples, uint64_t seed,
                            CRITERION split_criterion, bool quantile_per_tree,
                            int cfg_n_streams, bool use_experimental_backend,
                            int max_batch_size) {
diff --git a/cpp/src/randomforest/randomforest_impl.cuh b/cpp/src/randomforest/randomforest_impl.cuh
index e86719935b..04a0634749 100644
--- a/cpp/src/randomforest/randomforest_impl.cuh
+++ b/cpp/src/randomforest/randomforest_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -69,7 +69,7 @@ void rf<T, L>::prepare_fit_per_tree(
   const int num_sms, const cudaStream_t stream,
   const std::shared_ptr<deviceAllocator> device_allocator) {
   int rs = tree_id;
-  if (rf_params.seed > -1) rs = rf_params.seed + tree_id;
+  if (rf_params.seed != 0) rs = rf_params.seed + tree_id;
 
   raft::random::Rng rng(rs * 1000 | 0xFF00AA,
                         raft::random::GeneratorType::GenKiss99);
@@ -230,7 +230,8 @@ void rfClassifier<T>::fit(const raft::handle_t& user_handle, const T* input,
       trees[i].fit(handle.get_device_allocator(), handle.get_host_allocator(),
                    tempmem[stream_id]->stream, input, n_cols, n_rows, labels,
                    rowids, n_sampled_rows, n_unique_labels, tree_ptr,
-                   this->rf_params.tree_params, tempmem[stream_id]);
+                   this->rf_params.tree_params, this->rf_params.seed,
+                   tempmem[stream_id]);
   }
   //Cleanup
   for (int i = 0; i < n_streams; i++) {
@@ -506,7 +507,7 @@ void rfRegressor<T>::fit(const raft::handle_t& user_handle, const T* input,
       trees[i].fit(handle.get_device_allocator(), handle.get_host_allocator(),
                    tempmem[stream_id]->stream, input, n_cols, n_rows, labels,
                    rowids, n_sampled_rows, tree_ptr, this->rf_params.tree_params,
-                   tempmem[stream_id]);
+                   this->rf_params.seed, tempmem[stream_id]);
   }
   //Cleanup
   for (int i = 0; i < n_streams; i++) {
diff --git a/cpp/test/sg/decisiontree_batchedlevel_algo.cu b/cpp/test/sg/decisiontree_batchedlevel_algo.cu
index 3528c1213d..7a8681a25e 100644
--- a/cpp/test/sg/decisiontree_batchedlevel_algo.cu
+++ b/cpp/test/sg/decisiontree_batchedlevel_algo.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -67,8 +67,6 @@ class DtBaseTest : public ::testing::TestWithParam<DtTestParams> {
     allocator->deallocate(tmp, sizeof(T) * inparams.M * inparams.N, stream);
     rowids = (I*)allocator->allocate(sizeof(I) * inparams.M, stream);
     MLCommon::iota(rowids, 0, 1, inparams.M, stream);
-    colids = (I*)allocator->allocate(sizeof(I) * inparams.N, stream);
-    MLCommon::iota(colids, 0, 1, inparams.N, stream);
     quantiles =
       (T*)allocator->allocate(sizeof(T) * inparams.nbins * inparams.N, stream);
@@ -86,7 +84,6 @@ class DtBaseTest : public ::testing::TestWithParam<DtTestParams> {
     allocator->deallocate(data, sizeof(T) * inparams.M * inparams.N, stream);
     allocator->deallocate(labels, sizeof(L) * inparams.M, stream);
     allocator->deallocate(rowids, sizeof(int) * inparams.M, stream);
-    allocator->deallocate(colids, sizeof(int) * inparams.N, stream);
     allocator->deallocate(quantiles, sizeof(T) * inparams.nbins * inparams.N,
                           stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -98,7 +95,7 @@ class DtBaseTest : public ::testing::TestWithParam<DtTestParams> {
   std::shared_ptr<raft::handle_t> handle;
   T *data, *quantiles;
   L* labels;
-  I *rowids, *colids;
+  I* rowids;
   DecisionTreeParams params;
   DtTestParams inparams;
   std::vector<SparseTreeNode<T, L>> sparsetree;
@@ -129,8 +126,8 @@
 typedef DtClassifierTest DtClsTestF;
 TEST_P(DtClsTestF, Test) {
   int num_leaves, depth;
   grow_tree(
-    handle->get_device_allocator(), handle->get_host_allocator(), data,
-    inparams.N, inparams.M, labels, quantiles, rowids, colids, inparams.M,
+    handle->get_device_allocator(), handle->get_host_allocator(), data, 1, 0,
+    inparams.N, inparams.M, labels, quantiles, rowids, inparams.M,
     inparams.nclasses, params, stream, sparsetree, num_leaves, depth);
   // this is a "well behaved" dataset!
   ASSERT_EQ(depth, 1);
@@ -162,10 +159,10 @@
 typedef DtRegressorTest DtRegTestF;
 ///@todo: add checks
 TEST_P(DtRegTestF, Test) {
   int num_leaves, depth;
-  grow_tree(
-    handle->get_device_allocator(), handle->get_host_allocator(), data,
-    inparams.N, inparams.M, labels, quantiles, rowids, colids, inparams.M, 0,
-    params, stream, sparsetree, num_leaves, depth);
+  grow_tree(handle->get_device_allocator(),
+            handle->get_host_allocator(), data, 1, 0, inparams.N,
+            inparams.M, labels, quantiles, rowids, inparams.M, 0,
+            params, stream, sparsetree, num_leaves, depth);
   // goes all the way to max-depth
   ASSERT_EQ(depth, inparams.max_depth);
 }
diff --git a/cpp/test/sg/decisiontree_batchedlevel_unittest.cu b/cpp/test/sg/decisiontree_batchedlevel_unittest.cu
index c9f47cfee4..3bd5e0a58a 100644
--- a/cpp/test/sg/decisiontree_batchedlevel_unittest.cu
+++ b/cpp/test/sg/decisiontree_batchedlevel_unittest.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -77,8 +77,6 @@ class BatchedLevelAlgoUnitTestFixture {
       static_cast<LabelT*>(d_allocator->allocate(sizeof(LabelT) * n_row, 0));
     row_ids =
       static_cast<IdxT*>(d_allocator->allocate(sizeof(IdxT) * n_row, 0));
-    col_ids =
-      static_cast<IdxT*>(d_allocator->allocate(sizeof(IdxT) * n_col, 0));
 
     // Nodes that exist prior to the invocation of nodeSplitKernel()
     curr_nodes =
@@ -99,7 +97,6 @@ class BatchedLevelAlgoUnitTestFixture {
     raft::update_device(data, h_data.data(), n_row * n_col, 0);
     raft::update_device(labels, h_labels.data(), n_row, 0);
     MLCommon::iota(row_ids, 0, 1, n_row, 0);
-    MLCommon::iota(col_ids, 0, 1, n_col, 0);
 
     tempmem = std::make_shared<TemporaryMemory<DataT, LabelT>>(
       *raft_handle, cudaStream_t(0), n_row, n_col, 0, params);
@@ -117,7 +114,6 @@ class BatchedLevelAlgoUnitTestFixture {
     input.nSampledRows = n_row;
     input.nSampledCols = n_col;
     input.rowids = row_ids;
-    input.colids = col_ids;
     input.nclasses = 0;  // not applicable for regression
     input.quantiles = quantiles;
   }
@@ -127,7 +123,6 @@ class BatchedLevelAlgoUnitTestFixture {
     d_allocator->deallocate(data, sizeof(DataT) * n_row * n_col, 0);
     d_allocator->deallocate(labels, sizeof(LabelT) * n_row, 0);
     d_allocator->deallocate(row_ids, sizeof(IdxT) * n_row, 0);
-    d_allocator->deallocate(col_ids, sizeof(IdxT) * n_col, 0);
     d_allocator->deallocate(curr_nodes, sizeof(NodeT) * max_batch, 0);
     d_allocator->deallocate(new_nodes, sizeof(NodeT) * 2 * max_batch, 0);
     d_allocator->deallocate(n_new_nodes, sizeof(IdxT), 0);
@@ -157,7 +152,6 @@ class BatchedLevelAlgoUnitTestFixture {
   DataT* data;
   DataT* labels;
   IdxT* row_ids;
-  IdxT* col_ids;
 };
 
 class TestQuantiles : public ::testing::TestWithParam,
@@ -319,7 +313,9 @@ TEST_P(TestMetric, RegressionMetricGain) {
     pred, pred2, pred2P, pred_count, n_bins, params.max_depth,
     params.min_samples_split, params.min_samples_leaf,
     params.min_impurity_decrease, params.max_leaves, input, curr_nodes, 0,
-    done_count, mutex, n_new_leaves, splits, block_sync, split_criterion);
+    done_count, mutex, n_new_leaves, splits, block_sync, split_criterion, 0,
+    1234ULL);
+  raft::update_host(h_splits.data(), splits, 1, 0);
 
   CUDA_CHECK(cudaGetLastError());
   CUDA_CHECK(cudaStreamSynchronize(0));
diff --git a/cpp/test/sg/rf_accuracy_test.cu b/cpp/test/sg/rf_accuracy_test.cu
index f3ee0b2487..7ab48d60b6 100644
--- a/cpp/test/sg/rf_accuracy_test.cu
+++ b/cpp/test/sg/rf_accuracy_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -99,7 +99,7 @@ class RFClassifierAccuracyTest : public ::testing::TestWithParam {
     set_all_rf_params(rfp,
                       1,    /* n_trees */
                       true, /* bootstrap */
                       1.0,  /* max_samples */
-                      -1,   /* seed */
+                      0,    /* seed */
                       1,    /* n_streams */
                       tree_params);
   }
diff --git a/cpp/test/sg/rf_batched_classification_test.cu b/cpp/test/sg/rf_batched_classification_test.cu
index b39bebab5f..4677d5732d 100644
--- a/cpp/test/sg/rf_batched_classification_test.cu
+++ b/cpp/test/sg/rf_batched_classification_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -60,7 +60,7 @@ class RFBatchedClsTest : public ::testing::TestWithParam<RfInputs> {
                     params.split_criterion, false, true);
     RF_params rf_params;
     set_all_rf_params(rf_params, params.n_trees, params.bootstrap,
-                      params.max_samples, -1, params.n_streams, tree_params);
+                      params.max_samples, 0, params.n_streams, tree_params);
 
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.reset(new raft::handle_t(rf_params.n_streams));
diff --git a/cpp/test/sg/rf_batched_regression_test.cu b/cpp/test/sg/rf_batched_regression_test.cu
index 9c66586621..972b610cdd 100644
--- a/cpp/test/sg/rf_batched_regression_test.cu
+++ b/cpp/test/sg/rf_batched_regression_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -62,7 +62,7 @@ class RFBatchedRegTest : public ::testing::TestWithParam<RfInputs> {
                     params.split_criterion, false, true);
     RF_params rf_params;
     set_all_rf_params(rf_params, params.n_trees, params.bootstrap,
-                      params.max_samples, -1, params.n_streams, tree_params);
+                      params.max_samples, 0, params.n_streams, tree_params);
 
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.reset(new raft::handle_t(rf_params.n_streams));
diff --git a/cpp/test/sg/rf_depth_test.cu b/cpp/test/sg/rf_depth_test.cu
index fa69d393e3..d4204343fe 100644
--- a/cpp/test/sg/rf_depth_test.cu
+++ b/cpp/test/sg/rf_depth_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -75,7 +75,7 @@ class RfClassifierDepthTest : public ::testing::TestWithParam<int> {
                     params.split_criterion, false);
     RF_params rf_params;
     set_all_rf_params(rf_params, params.n_trees, params.bootstrap,
-                      params.max_samples, -1, params.n_streams, tree_params);
+                      params.max_samples, 0, params.n_streams, tree_params);
 
     int data_len = params.n_rows * params.n_cols;
     raft::allocate(data, data_len);
@@ -169,7 +169,7 @@ class RfRegressorDepthTest : public ::testing::TestWithParam<int> {
                     params.split_criterion, false);
     RF_params rf_params;
     set_all_rf_params(rf_params, params.n_trees, params.bootstrap,
-                      params.max_samples, -1, params.n_streams, tree_params);
+                      params.max_samples, 0, params.n_streams, tree_params);
 
     int data_len = params.n_rows * params.n_cols;
     raft::allocate(data, data_len);
diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index 71a69b72e8..b4451cdd05 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -64,7 +64,7 @@ class RfClassifierTest : public ::testing::TestWithParam<RfInputs<T>> {
                     params.split_criterion, false);
     RF_params rf_params;
     set_all_rf_params(rf_params, params.n_trees, params.bootstrap,
-                      params.max_samples, -1, params.n_streams, tree_params);
+                      params.max_samples, 0, params.n_streams, tree_params);
     //print(rf_params);
 
     //--------------------------------------------------------
@@ -167,7 +167,7 @@ class RfRegressorTest : public ::testing::TestWithParam<RfInputs<T>> {
                     params.split_criterion, false);
     RF_params rf_params;
     set_all_rf_params(rf_params, params.n_trees, params.bootstrap,
-                      params.max_samples, -1, params.n_streams, tree_params);
+                      params.max_samples, 0, params.n_streams, tree_params);
     //print(rf_params);
 
     //--------------------------------------------------------
diff --git a/cpp/test/sg/rf_treelite_test.cu b/cpp/test/sg/rf_treelite_test.cu
index b0dded3e88..31a6f4f9d0 100644
--- a/cpp/test/sg/rf_treelite_test.cu
+++ b/cpp/test/sg/rf_treelite_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -190,7 +190,7 @@ class RfTreeliteTestCommon : public ::testing::TestWithParam<RfInputs<T>> {
                     params.min_impurity_decrease, params.bootstrap_features,
                     params.split_criterion, false);
     set_all_rf_params(rf_params, params.n_trees, params.bootstrap,
-                      params.max_samples, -1, params.n_streams, tree_params);
+                      params.max_samples, 0, params.n_streams, tree_params);
 
     handle.reset(new raft::handle_t(rf_params.n_streams));
     data_len = params.n_rows * params.n_cols;
diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd
index 227ab56f41..c2b45cd8b4 100644
--- a/python/cuml/ensemble/randomforest_shared.pxd
+++ b/python/cuml/ensemble/randomforest_shared.pxd
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@ import numpy as np import warnings from libcpp cimport bool -from libc.stdint cimport uintptr_t +from libc.stdint cimport uintptr_t, uint64_t from libc.stdlib cimport calloc, malloc, free from libcpp.vector cimport vector from libcpp.string cimport string @@ -115,7 +115,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": bool, int, float, - int, + uint64_t, CRITERION, bool, int, diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 457b0ec1e6..7945068ef3 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -1,6 +1,6 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -42,7 +42,7 @@ from cython.operator cimport dereference as deref from libcpp cimport bool from libcpp.vector cimport vector -from libc.stdint cimport uintptr_t +from libc.stdint cimport uintptr_t, uint64_t from libc.stdlib cimport calloc, malloc, free from numba import cuda @@ -230,7 +230,6 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin): If set to true and following conditions are also met, experimental decision tree training implementation would be used: split_algo = 1 (GLOBAL_QUANTILE) - max_features = 1.0 (Feature sub-sampling disabled) quantile_per_tree = false (No per tree quantile computation) max_batch_size: int (default = 128) Maximum number of nodes that can be processed in a given batch. This is @@ -473,7 +472,7 @@ class RandomForestClassifier(BaseRandomForestModel, ClassifierMixin): self.bootstrap, self.n_estimators, self.max_samples, - seed_val, + seed_val, self.split_criterion, self.quantile_per_tree, self.n_streams, diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 8e4301907b..0b63a1c46d 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -42,7 +42,7 @@ from cython.operator cimport dereference as deref from libcpp cimport bool from libcpp.vector cimport vector -from libc.stdint cimport uintptr_t +from libc.stdint cimport uintptr_t, uint64_t from libc.stdlib cimport calloc, malloc, free from numba import cuda @@ -222,7 +222,6 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin): If set to true and following conditions are also met, experimental decision tree training implementation would be used: split_algo = 1 (GLOBAL_QUANTILE) - max_features = 1.0 (Feature sub-sampling disabled) quantile_per_tree = false (No per tree quantile computation) max_batch_size: int (default = 128) Maximum number of nodes that can be processed in a given batch. 
This is @@ -453,7 +452,7 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin):
                 self.bootstrap,
                 self.n_estimators,
                 self.max_samples,
-                seed_val,
+                seed_val,
                 self.split_criterion,
                 self.quantile_per_tree,
                 self.n_streams,
diff --git a/python/cuml/test/test_random_forest.py b/python/cuml/test/test_random_forest.py
index 119cf1c5de..f03d0b7d5c 100644
--- a/python/cuml/test/test_random_forest.py
+++ b/python/cuml/test/test_random_forest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -173,10 +173,6 @@ def test_rf_classification(small_clf, datatype, split_algo,
         captured_stdout = f.getvalue()
     if use_experimental_backend:
         is_fallback_used = False
-        if max_features != 1.0:
-            assert ('Experimental backend does not yet support feature ' +
-                    'sub-sampling' in captured_stdout)
-            is_fallback_used = True
         if split_algo != 1:
             assert ('Experimental backend does not yet support histogram ' +
                     'split algorithm' in captured_stdout)

From 28953ab8144dfd0d74f5c90540248a9e2705523c Mon Sep 17 00:00:00 2001
From: Michael Demoret <42954918+mdemoret-nv@users.noreply.github.com>
Date: Tue, 2 Feb 2021 12:17:54 -0700
Subject: [PATCH 04/29] Improve Python Docs with Default Role (#3445)

This PR sets the default type of "interpolated text" in sphinx to `:py:obj:`. This is very useful for us since we frequently use a single backtick in our python documentation to refer to another python object.

Currently, the docstring:
```
`cuml.datasets.make_blobs`
```
Would generate (italicized, variable spaced):
![image](https://user-images.githubusercontent.com/42954918/106529509-dd4c1900-64a7-11eb-9977-49a2594c7c3e.png)

This PR changes it to (bold, mono spaced):
![image](https://user-images.githubusercontent.com/42954918/106529282-77f82800-64a7-11eb-86e2-a38e37ccea0d.png)

The added benefit here is that if the interpolated text is found in the index, it will link to that section. So in the above example, clicking on `cuml.datasets.make_blobs` will take you to the function documentation.

Finally, this PR adds a new type of interpolated text role: `:py:`. This should be used for inline python code. For example, the following code:
```
* `cuml.datasets.make_blobs` for references to objects (functions, classes, modules, etc.)
* :py:`import cupy as cp` for inline python code
* ``import cupy as cp`` for literal code
```
will generate:
![image](https://user-images.githubusercontent.com/42954918/106530276-3f594e00-64a9-11eb-8edf-569fc9dd829e.png)

I also looked for a few examples to replace to help seed usage of these new options. Updating every location would be very time-consuming and is best done over time.
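
To make the distinction between the three roles concrete, here is a small, hypothetical docstring sketch (the function and wording are invented purely for illustration; only the role usage is the point):

```python
def make_example(n_samples=100):
    """Generate an example dataset (illustrative only).

    With ``default_role = "py:obj"``, a single-backtick reference like
    `cuml.datasets.make_blobs` renders mono-spaced and cross-links to the
    documented object, :py:`import cupy as cp` renders as highlighted
    inline Python code, and ``import cupy as cp`` stays a plain literal.
    """
```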
Authors: - Michael Demoret (@mdemoret-nv) Approvers: - Dante Gama Dessavre (@dantegd) URL: https://github.com/rapidsai/cuml/pull/3445 --- docs/source/api.rst | 5 +++ docs/source/conf.py | 12 +++++-- python/cuml/common/input_utils.py | 6 ++-- python/cuml/dask/datasets/classification.py | 16 ++++----- python/cuml/datasets/blobs.py | 10 +++--- python/cuml/datasets/classification.py | 34 +++++++++--------- python/cuml/decomposition/incremental_pca.py | 36 ++++++++++---------- python/cuml/svm/svc.pyx | 2 +- 8 files changed, 67 insertions(+), 54 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 41f92d8a0a..c2dc582139 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -2,6 +2,11 @@ cuML API Reference ~~~~~~~~~~~~~~~~~~~ +.. role:: py(code) + :language: python + :class: highlight + + Module Configuration ==================== diff --git a/docs/source/conf.py b/docs/source/conf.py index 8bc89fa0de..296b4b6f36 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # # This file is execfile()d with the current directory set to its # containing dir. @@ -182,7 +182,10 @@ ] # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'https://docs.python.org/': None} +intersphinx_mapping = { + "python": ('https://docs.python.org/', None), + "scipy": ('https://docs.scipy.org/doc/scipy/reference', None) +} # Config numpydoc numpydoc_show_inherited_class_members = False @@ -201,3 +204,8 @@ def setup(app): 'cuml', 'https://github.com/rapidsai/' 'cuml/blob/{revision}/python/' '{package}/{path}#L{lineno}') + +# Set the default role for interpreted code (anything surrounded in `single +# backticks`) to be a python object. See +# https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-default_role +default_role = "py:obj" \ No newline at end of file diff --git a/python/cuml/common/input_utils.py b/python/cuml/common/input_utils.py index b2549f591f..673a45232d 100644 --- a/python/cuml/common/input_utils.py +++ b/python/cuml/common/input_utils.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -85,8 +85,8 @@ def get_supported_input_type(X): ----- To closely match the functionality of :func:`~cuml.common.input_utils.input_to_cuml_array`, this method will - return ``cupy.ndarray`` for any object supporting - `__cuda_array_interface__` and ``numpy.ndarray`` for any object supporting + return `cupy.ndarray` for any object supporting + `__cuda_array_interface__` and `numpy.ndarray` for any object supporting `__array_interface__`. Returns diff --git a/python/cuml/dask/datasets/classification.py b/python/cuml/dask/datasets/classification.py index 0c06bf79eb..b7978b423d 100644 --- a/python/cuml/dask/datasets/classification.py +++ b/python/cuml/dask/datasets/classification.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -46,17 +46,17 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, This initially creates clusters of points normally distributed (std=1) about vertices of an `n_informative`-dimensional hypercube with sides of - length ``2 * class_sep`` and assigns an equal number of clusters to each + length :py:`2 * class_sep` and assigns an equal number of clusters to each class. It introduces interdependence between these features and adds various types of further noise to the data. - Without shuffling, ``X`` horizontally stacks features in the following + Without shuffling, `X` horizontally stacks features in the following order: the primary `n_informative` features, followed by `n_redundant` linear combinations of the informative features, followed by `n_repeated` duplicates, drawn randomly with replacement from the informative and redundant features. The remaining features are filled with random noise. Thus, without shuffling, all useful features are contained in the columns - ``X[:, :n_informative + n_redundant + n_repeated]``. + :py:`X[:, :n_informative + n_redundant + n_repeated]`. Examples -------- @@ -104,7 +104,7 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, The total number of features. These comprise `n_informative` informative features, `n_redundant` redundant features, `n_repeated` duplicated features and - ``n_features-n_informative-n_redundant-n_repeated`` useless features + :py:`n_features-n_informative-n_redundant-n_repeated` useless features drawn at random. n_informative : int, optional (default=2) The number of informative features. Each class is composed of a number @@ -124,10 +124,10 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, The number of classes (or labels) of the classification problem. n_clusters_per_class : int, optional (default=2) The number of clusters per class. - weights : array-like of shape ``(n_classes,)`` or ``(n_classes - 1,)``, \ - (default=None) + weights : array-like of shape :py:`(n_classes,)` or :py:`(n_classes - 1,)`\ + , (default=None) The proportions of samples assigned to each class. If None, then - classes are balanced. Note that if ``len(weights) == n_classes - 1``, + classes are balanced. Note that if :py:`len(weights) == n_classes - 1`, then the last class weight is automatically inferred. More than `n_samples` samples may be returned if the sum of `weights` exceeds 1. diff --git a/python/cuml/datasets/blobs.py b/python/cuml/datasets/blobs.py index ca3ebf415f..cbb6450e00 100644 --- a/python/cuml/datasets/blobs.py +++ b/python/cuml/datasets/blobs.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -80,12 +80,12 @@ def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, the number of samples per cluster. n_features : int, optional (default=2) The number of features for each sample. - centers : int or array of shape [n_centers, n_features], optional + centers : int or array of shape [`n_centers`, `n_features`], optional (default=None) The number of centers to generate, or the fixed center locations. - If n_samples is an int and centers is None, 3 centers are generated. - If n_samples is array-like, centers must be - either None or an array of length equal to the length of n_samples. + If `n_samples` is an int and centers is None, 3 centers are generated. 
+ If `n_samples` is array-like, centers must be + either None or an array of length equal to the length of `n_samples`. cluster_std : float or sequence of floats, optional (default=1.0) The standard deviation of the clusters. center_box : pair of floats (min, max), optional (default=(-10.0, 10.0)) diff --git a/python/cuml/datasets/classification.py b/python/cuml/datasets/classification.py index 236d4dc526..c6e927228d 100644 --- a/python/cuml/datasets/classification.py +++ b/python/cuml/datasets/classification.py @@ -54,17 +54,17 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, """ Generate a random n-class classification problem. This initially creates clusters of points normally distributed (std=1) - about vertices of an ``n_informative``-dimensional hypercube with sides of - length ``2*class_sep`` and assigns an equal number of clusters to each + about vertices of an `n_informative`-dimensional hypercube with sides of + length :py:`2*class_sep` and assigns an equal number of clusters to each class. It introduces interdependence between these features and adds various types of further noise to the data. - Without shuffling, ``X`` horizontally stacks features in the following - order: the primary ``n_informative`` features, followed by ``n_redundant`` - linear combinations of the informative features, followed by ``n_repeated`` + Without shuffling, `X` horizontally stacks features in the following + order: the primary `n_informative` features, followed by `n_redundant` + linear combinations of the informative features, followed by `n_repeated` duplicates, drawn randomly with replacement from the informative and redundant features. The remaining features are filled with random noise. Thus, without shuffling, all useful features are contained in the columns - ``X[:, :n_informative + n_redundant + n_repeated]``. + :py:`X[:, :n_informative + n_redundant + n_repeated]`. Examples -------- @@ -106,15 +106,15 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, n_samples : int, optional (default=100) The number of samples. n_features : int, optional (default=20) - The total number of features. These comprise ``n_informative`` - informative features, ``n_redundant`` redundant features, - ``n_repeated`` duplicated features and - ``n_features-n_informative-n_redundant-n_repeated`` useless features + The total number of features. These comprise `n_informative` + informative features, `n_redundant` redundant features, + `n_repeated` duplicated features and + :py:`n_features-n_informative-n_redundant-n_repeated` useless features drawn at random. n_informative : int, optional (default=2) The number of informative features. Each class is composed of a number of gaussian clusters each located around the vertices of a hypercube - in a subspace of dimension ``n_informative``. For each cluster, + in a subspace of dimension `n_informative`. For each cluster, informative features are drawn independently from N(0, 1) and then randomly linearly combined within each cluster in order to add covariance. The clusters are then placed on the vertices of the @@ -132,10 +132,10 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, weights : array-like of shape (n_classes,) or (n_classes - 1,),\ (default=None) The proportions of samples assigned to each class. If None, then - classes are balanced. Note that if ``len(weights) == n_classes - 1``, + classes are balanced. 
Note that if :py:`len(weights) == n_classes - 1`, then the last class weight is automatically inferred. - More than ``n_samples`` samples may be returned if the sum of - ``weights`` exceeds 1. + More than `n_samples` samples may be returned if the sum of + `weights` exceeds 1. flip_y : float, optional (default=0.01) The fraction of samples whose class is assigned randomly. Larger values introduce noise in the labels and make the classification @@ -188,7 +188,7 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, time for each feature class (informative, repeated, etc.) while also providing the added speedup of generating a big matrix on GPU - 2. We generate `order=F` construction. We exploit the + 2. We generate :py:`order=F` construction. We exploit the fact that X is a generated from a univariate normal, and covariance is introduced with matrix multiplications. Which means, we can generate X as a 1D array and just reshape it to the @@ -196,8 +196,8 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, copies 3. Lastly, we also shuffle by construction. Centroid indices are permuted for each sample, and then we construct the data for - each centroid. This shuffle works for both `order=C` and - `order=F` and eliminates any need for secondary copies + each centroid. This shuffle works for both :py:`order=C` and + :py:`order=F` and eliminates any need for secondary copies References ---------- diff --git a/python/cuml/decomposition/incremental_pca.py b/python/cuml/decomposition/incremental_pca.py index 4bad91e08c..3538c89ddc 100644 --- a/python/cuml/decomposition/incremental_pca.py +++ b/python/cuml/decomposition/incremental_pca.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -39,15 +39,15 @@ class IncrementalPCA(PCA): Depending on the size of the input data, this algorithm can be much more memory efficient than a PCA, and allows sparse input. This algorithm has constant memory complexity, on the order of - ``batch_size * n_features``, enabling use of np.memmap files without + :py:`batch_size * n_features`, enabling use of np.memmap files without loading the entire file into memory. For sparse matrices, the input is converted to dense in batches (in order to be able to subtract the mean) which avoids storing the entire dense matrix at any one time. The computational overhead of each SVD is - ``O(batch_size * n_features ** 2)``, but only 2 * batch_size samples - remain in memory at a time. There will be ``n_samples / batch_size`` + :py:`O(batch_size * n_features ** 2)`, but only 2 * batch_size samples + remain in memory at a time. There will be :py:`n_samples / batch_size` SVD computations to get the principal components, versus 1 large SVD - of complexity ``O(n_samples * n_features ** 2)`` for PCA. + of complexity :py:`O(n_samples * n_features ** 2)` for PCA. Parameters ---------- @@ -60,8 +60,8 @@ class IncrementalPCA(PCA): handles in several streams. If it is None, a new one is created. n_components : int or None, (default=None) - Number of components to keep. If ``n_components`` is ``None``, - then ``n_components`` is set to ``min(n_samples, n_features)``. + Number of components to keep. If `n_components` is ``None``, + then `n_components` is set to :py:`min(n_samples, n_features)`. whiten : bool, optional If True, de-correlates the components. 
This is done by dividing them by the corresponding singular values then multiplying by sqrt(n_samples). @@ -69,12 +69,12 @@ class IncrementalPCA(PCA): multi-collinearity. It might be beneficial for downstream tasks like LinearRegression where correlated features cause problems. copy : bool, (default=True) - If False, X will be overwritten. ``copy=False`` can be used to + If False, X will be overwritten. :py:`copy=False` can be used to save memory but is unsafe for general use. batch_size : int or None, (default=None) The number of samples to use for each batch. Only used when calling - ``fit``. If ``batch_size`` is ``None``, then ``batch_size`` - is inferred from the data and set to ``5 * n_features``, to provide a + `fit`. If `batch_size` is ``None``, then `batch_size` + is inferred from the data and set to :py:`5 * n_features`, to provide a balance between approximation accuracy and memory consumption. verbose : int or boolean, default=False Sets logging level. It must be one of `cuml.common.logger.level_*`. @@ -98,24 +98,24 @@ class IncrementalPCA(PCA): to 1.0. singular_values_ : array, shape (n_components,) The singular values corresponding to each of the selected components. - The singular values are equal to the 2-norms of the ``n_components`` + The singular values are equal to the 2-norms of the `n_components` variables in the lower-dimensional space. mean_ : array, shape (n_features,) - Per-feature empirical mean, aggregate over calls to ``partial_fit``. + Per-feature empirical mean, aggregate over calls to `partial_fit`. var_ : array, shape (n_features,) Per-feature empirical variance, aggregate over calls to - ``partial_fit``. + `partial_fit`. noise_variance_ : float The estimated noise covariance following the Probabilistic PCA model from [4]_. n_components_ : int The estimated number of components. Relevant when - ``n_components=None``. + `n_components=None`. n_samples_seen_ : int The number of samples processed by the estimator. Will be reset on - new calls to fit, but increments across ``partial_fit`` calls. + new calls to fit, but increments across `partial_fit` calls. batch_size_ : int - Inferred batch size from ``batch_size``. + Inferred batch size from `batch_size`. Notes ----- @@ -126,8 +126,8 @@ class IncrementalPCA(PCA): decomposition used in specific situations to reduce the algorithmic complexity of the SVD. The source for this technique is [3]_. This technique has been omitted because it is advantageous only when decomposing - a matrix with ``n_samples >= 5/3 * n_features`` where ``n_samples`` and - ``n_features`` are the matrix rows and columns, respectively. In addition, + a matrix with :py:`n_samples >= 5/3 * n_features` where `n_samples` and + `n_features` are the matrix rows and columns, respectively. In addition, it hurts the readability of the implemented algorithm. This would be a good opportunity for future optimization, if it is deemed necessary. diff --git a/python/cuml/svm/svc.pyx b/python/cuml/svm/svc.pyx index 596908d086..04d30aceb8 100644 --- a/python/cuml/svm/svc.pyx +++ b/python/cuml/svm/svc.pyx @@ -216,7 +216,7 @@ class SVC(SVMBase, ClassifierMixin): coef_ : float, shape (1, n_cols) Only available for linear kernels. It is the normal of the hyperplane. 
-        ``coef_ = sum_k=1..n_support dual_coef_[k] * support_vectors[k,:]``
+        coef_ = sum_k=1..n_support dual_coef_[k] * support_vectors[k,:]
     classes_: shape (n_classes_,)
         Array of class labels
     n_classes_ : int

From 8201a33657697a2ef3c8f8587322797c63e2c83b Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Tue, 2 Feb 2021 20:56:09 +0100
Subject: [PATCH 05/29] MNMG DBSCAN (#3382)

This Pull Request adds initial support for multi-node multi-GPU DBSCAN, and fixes the bugs identified in #3094.

It works by copying the dataset to all the workers and giving ownership of a subset of points to each one. The workers compute a partial clustering with the knowledge of the relationships between their points and the rest of the dataset, and the partial clusterings are merged to form the final labeling. This merging algorithm is also used to accumulate the results in case a batch-wise approach is used on a worker to limit the memory consumption.

The multi-GPU implementation gives great speedups for large datasets, while for small datasets the performance is dominated by the Dask launch overhead, as shown in the figure below:

![mnmg_dbscan_perf](https://user-images.githubusercontent.com/17441062/104958437-55a6da80-59d0-11eb-8a18-fcca0d69c41b.png)

Notes:
- I have renamed variables in the DBSCAN implementation to match our style conventions (snake case). Sorry for the noise that it adds to this PR.
- I refactored some CSR tests to accept multiple test cases instead of hardcoded ones, in order to add corner cases to weak CC. PR #3157 by @cjnolet changed the location of these tests, so I moved those that I had already refactored accordingly. At the moment only the tests that were in `cpp/test/prims/csr.cu` previously have been refactored. I was thinking that the others could be refactored later; I'd like @cjnolet's opinion on this refactoring.
- Regarding testing, the MNMG tests are mostly a copy of the single-GPU ones, though I removed a few tests with very small datasets to avoid problems with MNMG (it doesn't really support the edge case where a worker owns 0 samples, and I think it's a fair assumption that MNMG DBSCAN isn't used with such a tiny dataset).
- Also regarding tests, I changed the comparison function to account for the fact that border points are ambiguous. It assumes that the labeling of core points is minimal in both our implementation and the reference, so if this assumption changes we will need to update the tests accordingly.

If you want to access a pseudo-code description and proof of the new algorithm, feel free to contact me.

Tagging people to whom this PR is relevant: @teju85 @tfeher @MatthiasKohl @canonizer

Authors:
  - Louis Sugy (@Nyrio)

Approvers:
  - Tamas Bela Feher (@tfeher)
  - Corey J.
Nolet (@cjnolet) URL: https://github.com/rapidsai/cuml/pull/3382 --- cpp/CMakeLists.txt | 2 +- cpp/bench/sg/dbscan.cu | 10 +- cpp/examples/dbscan/dbscan_example.cpp | 6 +- cpp/include/cuml/cluster/dbscan.hpp | 39 +-- cpp/src/dbscan/adjgraph/algo.cuh | 25 +- cpp/src/dbscan/adjgraph/naive.cuh | 32 +- cpp/src/dbscan/adjgraph/pack.h | 10 +- cpp/src/dbscan/adjgraph/runner.cuh | 15 +- cpp/src/dbscan/corepoints/compute.cuh | 50 ++++ cpp/src/dbscan/corepoints/exchange.cuh | 59 ++++ cpp/src/dbscan/dbscan.cu | 115 ++++---- cpp/src/dbscan/dbscan.cuh | 140 +++++---- cpp/src/dbscan/dbscan_api.cpp | 10 +- cpp/src/dbscan/dbscan_api.h | 2 +- cpp/src/dbscan/mergelabels/runner.cuh | 47 +++ cpp/src/dbscan/mergelabels/tree_reduction.cuh | 82 ++++++ cpp/src/dbscan/runner.cuh | 274 +++++++++++------- cpp/src/dbscan/vertexdeg/algo.cuh | 15 +- cpp/src/dbscan/vertexdeg/naive.cuh | 34 ++- cpp/src/dbscan/vertexdeg/pack.h | 6 +- cpp/src/dbscan/vertexdeg/runner.cuh | 15 +- cpp/src_prims/label/merge_labels.cuh | 146 ++++++++++ cpp/src_prims/sparse/csr.cuh | 247 ++++++---------- cpp/test/CMakeLists.txt | 1 + cpp/test/prims/csr.cu | 243 +++++++++++----- cpp/test/prims/csr.h | 34 --- cpp/test/prims/merge_labels.cu | 147 ++++++++++ cpp/test/prims/sparse/add.cu | 219 ++++++++------ cpp/test/prims/sparse/convert_coo.cu | 93 +++--- cpp/test/prims/sparse/convert_csr.cu | 110 +++++-- cpp/test/prims/sparse/norm.cu | 171 +++++------ cpp/test/prims/sparse/row_op.cu | 116 +++++--- cpp/test/sg/dbscan_test.cu | 15 +- docs/source/api.rst | 6 + python/cuml/cluster/__init__.py | 2 +- python/cuml/cluster/dbscan.pyx | 216 +++++++------- python/cuml/cluster/dbscan_mg.pyx | 45 +++ python/cuml/dask/cluster/__init__.py | 3 +- python/cuml/dask/cluster/dbscan.py | 158 ++++++++++ python/cuml/test/dask/test_dbscan.py | 215 ++++++++++++++ python/cuml/test/test_dbscan.py | 149 ++++++---- python/cuml/test/utils.py | 39 +++ 42 files changed, 2303 insertions(+), 1060 deletions(-) create mode 100644 cpp/src/dbscan/corepoints/compute.cuh create mode 100644 cpp/src/dbscan/corepoints/exchange.cuh create mode 100644 cpp/src/dbscan/mergelabels/runner.cuh create mode 100644 cpp/src/dbscan/mergelabels/tree_reduction.cuh create mode 100644 cpp/src_prims/label/merge_labels.cuh delete mode 100644 cpp/test/prims/csr.h create mode 100644 cpp/test/prims/merge_labels.cu create mode 100644 python/cuml/cluster/dbscan_mg.pyx create mode 100644 python/cuml/dask/cluster/dbscan.py create mode 100644 python/cuml/test/dask/test_dbscan.py diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d6cf851aef..2eacef46eb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/cpp/bench/sg/dbscan.cu b/cpp/bench/sg/dbscan.cu index 948dca6a74..44818eb959 100644 --- a/cpp/bench/sg/dbscan.cu +++ b/cpp/bench/sg/dbscan.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -51,10 +51,10 @@ class Dbscan : public BlobsFixture { state.SkipWithError("Dbscan only supports row-major inputs"); } this->loopOnState(state, [this, &state]() { - dbscanFit(*this->handle, this->data.X, this->params.nrows, - this->params.ncols, D(dParams.eps), dParams.min_pts, - this->data.y, this->core_sample_indices, - dParams.max_bytes_per_batch); + ML::Dbscan::fit(*this->handle, this->data.X, this->params.nrows, + this->params.ncols, D(dParams.eps), dParams.min_pts, + this->data.y, this->core_sample_indices, + dParams.max_bytes_per_batch); state.SetItemsProcessed(this->params.nrows * this->params.ncols); }); } diff --git a/cpp/examples/dbscan/dbscan_example.cpp b/cpp/examples/dbscan/dbscan_example.cpp index 273d1fa71e..37722b65d6 100644 --- a/cpp/examples/dbscan/dbscan_example.cpp +++ b/cpp/examples/dbscan/dbscan_example.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -200,8 +200,8 @@ int main(int argc, char* argv[]) { << "eps - " << eps << std::endl << "max_bytes_per_batch - " << max_bytes_per_batch << std::endl; - ML::dbscanFit(handle, d_inputData, nRows, nCols, eps, minPts, d_labels, - nullptr, max_bytes_per_batch, false); + ML::Dbscan::fit(handle, d_inputData, nRows, nCols, eps, minPts, d_labels, + nullptr, max_bytes_per_batch, false); CUDA_RT_CALL(cudaMemcpyAsync(h_labels.data(), d_labels, nRows * sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_RT_CALL(cudaStreamSynchronize(stream)); diff --git a/cpp/include/cuml/cluster/dbscan.hpp b/cpp/include/cuml/cluster/dbscan.hpp index e1a1dbe350..710a7aa423 100644 --- a/cpp/include/cuml/cluster/dbscan.hpp +++ b/cpp/include/cuml/cluster/dbscan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include namespace ML { +namespace Dbscan { /** * @defgroup DbscanCpp C++ implementation of Dbscan algo @@ -39,28 +40,30 @@ namespace ML { * @param[in] max_bytes_per_batch the maximum number of megabytes to be used for * each batch of the pairwise distance calculation. This enables the * trade off between memory usage and algorithm execution time. 
- * @param[in] verbosity: verbosity level for logging messages during execution + * @param[in] verbosity verbosity level for logging messages during execution + * @param[in] opg whether we are running in a multi-node multi-GPU context * @{ */ -void dbscanFit(const raft::handle_t &handle, float *input, int n_rows, - int n_cols, float eps, int min_pts, int *labels, - int *core_sample_indices = nullptr, - size_t max_bytes_per_batch = 0, int verbosity = CUML_LEVEL_INFO); -void dbscanFit(const raft::handle_t &handle, double *input, int n_rows, - int n_cols, double eps, int min_pts, int *labels, - int *core_sample_indices = nullptr, - size_t max_bytes_per_batch = 0, int verbosity = CUML_LEVEL_INFO); +void fit(const raft::handle_t &handle, float *input, int n_rows, int n_cols, + float eps, int min_pts, int *labels, + int *core_sample_indices = nullptr, size_t max_bytes_per_batch = 0, + int verbosity = CUML_LEVEL_INFO, bool opg = false); +void fit(const raft::handle_t &handle, double *input, int n_rows, int n_cols, + double eps, int min_pts, int *labels, + int *core_sample_indices = nullptr, size_t max_bytes_per_batch = 0, + int verbosity = CUML_LEVEL_INFO, bool opg = false); -void dbscanFit(const raft::handle_t &handle, float *input, int64_t n_rows, - int64_t n_cols, float eps, int min_pts, int64_t *labels, - int64_t *core_sample_indices = nullptr, - size_t max_bytes_per_batch = 0, int verbosity = CUML_LEVEL_INFO); -void dbscanFit(const raft::handle_t &handle, double *input, int64_t n_rows, - int64_t n_cols, double eps, int min_pts, int64_t *labels, - int64_t *core_sample_indices = nullptr, - size_t max_bytes_per_batch = 0, int verbosity = CUML_LEVEL_INFO); +void fit(const raft::handle_t &handle, float *input, int64_t n_rows, + int64_t n_cols, float eps, int min_pts, int64_t *labels, + int64_t *core_sample_indices = nullptr, size_t max_bytes_per_batch = 0, + int verbosity = CUML_LEVEL_INFO, bool opg = false); +void fit(const raft::handle_t &handle, double *input, int64_t n_rows, + int64_t n_cols, double eps, int min_pts, int64_t *labels, + int64_t *core_sample_indices = nullptr, size_t max_bytes_per_batch = 0, + int verbosity = CUML_LEVEL_INFO, bool opg = false); /** @} */ +} // namespace Dbscan } // namespace ML diff --git a/cpp/src/dbscan/adjgraph/algo.cuh b/cpp/src/dbscan/adjgraph/algo.cuh index 24a9f3f720..163d943377 100644 --- a/cpp/src/dbscan/adjgraph/algo.cuh +++ b/cpp/src/dbscan/adjgraph/algo.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ using namespace thrust; +namespace ML { namespace Dbscan { namespace AdjGraph { namespace Algo { @@ -38,30 +39,21 @@ static const int TPB_X = 256; /** * Takes vertex degree array (vd) and CSR row_ind array (ex_scan) to produce the - * CSR row_ind_ptr array (adj_graph) and filters into a core_pts array based on min_pts. 
+ * CSR row_ind_ptr array (adj_graph) */ template -void launcher(const raft::handle_t &handle, Pack data, Index_ batchSize, - cudaStream_t stream) { +void launcher(const raft::handle_t &handle, Pack data, + Index_ batch_size, cudaStream_t stream) { device_ptr dev_vd = device_pointer_cast(data.vd); device_ptr dev_ex_scan = device_pointer_cast(data.ex_scan); ML::thrustAllocatorAdapter alloc(handle.get_device_allocator(), stream); exclusive_scan(thrust::cuda::par(alloc).on(stream), dev_vd, - dev_vd + batchSize, dev_ex_scan); - - bool *core_pts = data.core_pts; - int minPts = data.minPts; - Index_ *vd = data.vd; + dev_vd + batch_size, dev_ex_scan); raft::sparse::convert::csr_adj_graph_batched( - data.ex_scan, data.N, data.adjnnz, batchSize, data.adj, data.adj_graph, - stream, - [core_pts, minPts, vd] __device__(Index_ row, Index_ start_idx, - Index_ stop_idx) { - // fuse the operation of core points construction - core_pts[row] = (vd[row] >= minPts); - }); + data.ex_scan, data.N, data.adjnnz, batch_size, data.adj, data.adj_graph, + stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -69,3 +61,4 @@ void launcher(const raft::handle_t &handle, Pack data, Index_ batchSize, } // namespace Algo } // namespace AdjGraph } // namespace Dbscan +} // namespace ML \ No newline at end of file diff --git a/cpp/src/dbscan/adjgraph/naive.cuh b/cpp/src/dbscan/adjgraph/naive.cuh index ae44f5a59e..20d0175863 100644 --- a/cpp/src/dbscan/adjgraph/naive.cuh +++ b/cpp/src/dbscan/adjgraph/naive.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,47 +23,45 @@ #include "../common.cuh" #include "pack.h" +namespace ML { namespace Dbscan { namespace AdjGraph { namespace Naive { template -void launcher(const raft::handle_t& handle, Pack data, Index_ batchSize, - cudaStream_t stream) { +void launcher(const raft::handle_t& handle, Pack data, + Index_ batch_size, cudaStream_t stream) { Index_ k = 0; Index_ N = data.N; MLCommon::host_buffer host_vd(handle.get_host_allocator(), stream, - batchSize + 1); - MLCommon::host_buffer host_core_pts(handle.get_host_allocator(), stream, - batchSize); + batch_size + 1); MLCommon::host_buffer host_adj(handle.get_host_allocator(), stream, - batchSize * N); + batch_size * N); MLCommon::host_buffer host_ex_scan(handle.get_host_allocator(), - stream, batchSize); - raft::update_host(host_adj.data(), data.adj, batchSize * N, stream); - raft::update_host(host_vd.data(), data.vd, batchSize + 1, stream); + stream, batch_size); + raft::update_host(host_adj.data(), data.adj, batch_size * N, stream); + raft::update_host(host_vd.data(), data.vd, batch_size + 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - size_t adjgraph_size = size_t(host_vd[batchSize]); + size_t adjgraph_size = size_t(host_vd[batch_size]); MLCommon::host_buffer host_adj_graph(handle.get_host_allocator(), stream, adjgraph_size); - for (Index_ i = 0; i < batchSize; i++) { + for (Index_ i = 0; i < batch_size; i++) { for (Index_ j = 0; j < N; j++) { + /// TODO: change layout or remove; cf #3414 if (host_adj[i * N + j]) { host_adj_graph[k] = j; k = k + 1; } } } - for (Index_ i = 0; i < batchSize; i++) - host_core_pts[i] = (host_vd[i] >= data.minPts); host_ex_scan[0] = Index_(0); - for (Index_ i = 1; i < batchSize; i++) + for (Index_ i = 1; i < batch_size; i++) host_ex_scan[i] = host_ex_scan[i - 1] + host_vd[i - 1]; 
raft::update_device(data.adj_graph, host_adj_graph.data(), adjgraph_size, stream); - raft::update_device(data.core_pts, host_core_pts.data(), batchSize, stream); - raft::update_device(data.ex_scan, host_ex_scan.data(), batchSize, stream); + raft::update_device(data.ex_scan, host_ex_scan.data(), batch_size, stream); } } // namespace Naive } // namespace AdjGraph } // namespace Dbscan +} // namespace ML \ No newline at end of file diff --git a/cpp/src/dbscan/adjgraph/pack.h b/cpp/src/dbscan/adjgraph/pack.h index 83bf85c943..4e6eafe101 100644 --- a/cpp/src/dbscan/adjgraph/pack.h +++ b/cpp/src/dbscan/adjgraph/pack.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once +namespace ML { namespace Dbscan { namespace AdjGraph { @@ -36,13 +37,10 @@ struct Pack { /** exculusive scan generated from vd */ Index_ *ex_scan; - /** array to store whether a vertex is core poType or not */ - bool *core_pts; - /** number of poTypes in the dataset */ + /** number of points in the dataset */ Index_ N; - /** Minpts for classifying core pts */ - Index_ minPts; }; } // namespace AdjGraph } // namespace Dbscan +} // namespace ML diff --git a/cpp/src/dbscan/adjgraph/runner.cuh b/cpp/src/dbscan/adjgraph/runner.cuh index 90122a3bd5..c9298c7cea 100644 --- a/cpp/src/dbscan/adjgraph/runner.cuh +++ b/cpp/src/dbscan/adjgraph/runner.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,21 +21,21 @@ #include "naive.cuh" #include "pack.h" +namespace ML { namespace Dbscan { namespace AdjGraph { template void run(const raft::handle_t& handle, bool* adj, Index_* vd, Index_* adj_graph, - Index_ adjnnz, Index_* ex_scan, Index_ N, Index_ minpts, - bool* core_pts, int algo, Index_ batchSize, cudaStream_t stream) { - Pack data = {vd, adj, adj_graph, adjnnz, - ex_scan, core_pts, N, minpts}; + Index_ adjnnz, Index_* ex_scan, Index_ N, int algo, Index_ batch_size, + cudaStream_t stream) { + Pack data = {vd, adj, adj_graph, adjnnz, ex_scan, N}; switch (algo) { case 0: - Naive::launcher(handle, data, batchSize, stream); + Naive::launcher(handle, data, batch_size, stream); break; case 1: - Algo::launcher(handle, data, batchSize, stream); + Algo::launcher(handle, data, batch_size, stream); break; default: ASSERT(false, "Incorrect algo passed! '%d'", algo); @@ -44,3 +44,4 @@ void run(const raft::handle_t& handle, bool* adj, Index_* vd, Index_* adj_graph, } // namespace AdjGraph } // namespace Dbscan +} // namespace ML \ No newline at end of file diff --git a/cpp/src/dbscan/corepoints/compute.cuh b/cpp/src/dbscan/corepoints/compute.cuh new file mode 100644 index 0000000000..6de9e3c957 --- /dev/null +++ b/cpp/src/dbscan/corepoints/compute.cuh @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace ML { +namespace Dbscan { +namespace CorePoints { + +/** + * Compute the core points from the vertex degrees and min_pts criterion + * @param[in] handle cuML handle + * @param[in] vd Vertex degrees + * @param[out] mask Boolean core point mask + * @param[in] min_pts Core point criterion + * @param[in] start_vertex_id First point of the batch + * @param[in] batch_size Batch size + * @param[in] stream CUDA stream + */ +template +void compute(const raft::handle_t& handle, const Index_* vd, bool* mask, + Index_ min_pts, Index_ start_vertex_id, Index_ batch_size, + cudaStream_t stream) { + auto execution_policy = + ML::thrust_exec_policy(handle.get_device_allocator(), stream); + auto counting = thrust::make_counting_iterator(0); + thrust::for_each(execution_policy->on(stream), counting, + counting + batch_size, [=] __device__(Index_ idx) { + mask[idx + start_vertex_id] = vd[idx] >= min_pts; + }); +} + +} // namespace CorePoints +} // namespace Dbscan +} // namespace ML diff --git a/cpp/src/dbscan/corepoints/exchange.cuh b/cpp/src/dbscan/corepoints/exchange.cuh new file mode 100644 index 0000000000..6a4883b1a1 --- /dev/null +++ b/cpp/src/dbscan/corepoints/exchange.cuh @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include
+
+namespace ML {
+namespace Dbscan {
+namespace CorePoints {
+
+/**
+ * Exchange the core point mask between workers so that every worker holds
+ * the complete mask for all N points
+ * @param[inout] mask Boolean core point mask
+ * @param[in] handle cuML handle
+ * @param[in] N Number of points
+ * @param[in] start_row Offset for this node
+ * @param[in] stream CUDA stream
+ */
+template
+void exchange(const raft::handle_t& handle, bool* mask, Index_ N,
+              Index_ start_row, cudaStream_t stream) {
+  const auto& comm = handle.get_comms();
+  int my_rank = comm.get_rank();
+  int n_rank = comm.get_size();
+
+  // Array with the size of the contribution of each worker
+  Index_ rows_per_rank = raft::ceildiv(N, n_rank);
+  std::vector recvcounts = std::vector(n_rank, rows_per_rank);
+  recvcounts[n_rank - 1] = N - (n_rank - 1) * rows_per_rank;
+
+  // Array with the displacement of each part
+  std::vector displs = std::vector(n_rank);
+  for (int i = 0; i < n_rank; i++) displs[i] = i * rows_per_rank;
+
+  // All-gather operation with variable contribution length
+  comm.allgatherv((char*)mask + start_row, (char*)mask, recvcounts.data(),
+                  displs.data(), stream);
+  ASSERT(comm.sync_stream(stream) == raft::comms::status_t::SUCCESS,
+         "An error occurred in the distributed operation. This can result from "
+         "a failed rank");
+}
+
+}  // namespace CorePoints
+}  // namespace Dbscan
+}  // namespace ML
diff --git a/cpp/src/dbscan/dbscan.cu b/cpp/src/dbscan/dbscan.cu
index 7f8ae9e286..a70b31ae83 100644
--- a/cpp/src/dbscan/dbscan.cu
+++ b/cpp/src/dbscan/dbscan.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,78 +18,63 @@ #include #include #include "dbscan.cuh" -#include "runner.cuh" namespace ML { +namespace Dbscan { -using namespace Dbscan; - -void dbscanFit(const raft::handle_t &handle, float *input, int n_rows, - int n_cols, float eps, int min_pts, int *labels, - size_t max_bytes_per_batch, int verbosity) { - dbscanFitImpl(handle, input, n_rows, n_cols, eps, min_pts, labels, - nullptr, max_bytes_per_batch, handle.get_stream(), - verbosity); -} - -void dbscanFit(const raft::handle_t &handle, double *input, int n_rows, - int n_cols, double eps, int min_pts, int *labels, - size_t max_bytes_per_batch, int verbosity) { - dbscanFitImpl(handle, input, n_rows, n_cols, eps, min_pts, - labels, nullptr, max_bytes_per_batch, - handle.get_stream(), verbosity); -} - -void dbscanFit(const raft::handle_t &handle, float *input, int n_rows, - int n_cols, float eps, int min_pts, int *labels, - int *core_sample_indices, size_t max_bytes_per_batch, - int verbosity) { - dbscanFitImpl(handle, input, n_rows, n_cols, eps, min_pts, labels, - core_sample_indices, max_bytes_per_batch, - handle.get_stream(), verbosity); -} - -void dbscanFit(const raft::handle_t &handle, double *input, int n_rows, - int n_cols, double eps, int min_pts, int *labels, - int *core_sample_indices, size_t max_bytes_per_batch, - int verbosity) { - dbscanFitImpl(handle, input, n_rows, n_cols, eps, min_pts, - labels, core_sample_indices, max_bytes_per_batch, - handle.get_stream(), verbosity); -} - -void dbscanFit(const raft::handle_t &handle, float *input, int64_t n_rows, - int64_t n_cols, float eps, int min_pts, int64_t *labels, - size_t max_bytes_per_batch, int verbosity) { - dbscanFitImpl(handle, input, n_rows, n_cols, eps, min_pts, - labels, nullptr, max_bytes_per_batch, - handle.get_stream(), verbosity); +void fit(const raft::handle_t &handle, float *input, int n_rows, int n_cols, + float eps, int min_pts, int *labels, int *core_sample_indices, + size_t max_bytes_per_batch, int verbosity, bool opg) { + if (opg) + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); + else + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); } -void dbscanFit(const raft::handle_t &handle, double *input, int64_t n_rows, - int64_t n_cols, double eps, int min_pts, int64_t *labels, - size_t max_bytes_per_batch, int verbosity) { - dbscanFitImpl(handle, input, n_rows, n_cols, eps, min_pts, - labels, nullptr, max_bytes_per_batch, - handle.get_stream(), verbosity); +void fit(const raft::handle_t &handle, double *input, int n_rows, int n_cols, + double eps, int min_pts, int *labels, int *core_sample_indices, + size_t max_bytes_per_batch, int verbosity, bool opg) { + if (opg) + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); + else + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); } -void dbscanFit(const raft::handle_t &handle, float *input, int64_t n_rows, - int64_t n_cols, float eps, int min_pts, int64_t *labels, - int64_t *core_sample_indices, size_t max_bytes_per_batch, - int verbosity) { - dbscanFitImpl( - handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, - max_bytes_per_batch, handle.get_stream(), verbosity); +void fit(const raft::handle_t &handle, 
float *input, int64_t n_rows, + int64_t n_cols, float eps, int min_pts, int64_t *labels, + int64_t *core_sample_indices, size_t max_bytes_per_batch, + int verbosity, bool opg) { + if (opg) + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); + else + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); } -void dbscanFit(const raft::handle_t &handle, double *input, int64_t n_rows, - int64_t n_cols, double eps, int min_pts, int64_t *labels, - int64_t *core_sample_indices, size_t max_bytes_per_batch, - int verbosity) { - dbscanFitImpl( - handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, - max_bytes_per_batch, handle.get_stream(), verbosity); +void fit(const raft::handle_t &handle, double *input, int64_t n_rows, + int64_t n_cols, double eps, int min_pts, int64_t *labels, + int64_t *core_sample_indices, size_t max_bytes_per_batch, + int verbosity, bool opg) { + if (opg) + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); + else + dbscanFitImpl( + handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, + max_bytes_per_batch, handle.get_stream(), verbosity); } -}; // end namespace ML +} // namespace Dbscan +} // namespace ML diff --git a/cpp/src/dbscan/dbscan.cuh b/cpp/src/dbscan/dbscan.cuh index af6420f1d1..6c8b600e0f 100644 --- a/cpp/src/dbscan/dbscan.cuh +++ b/cpp/src/dbscan/dbscan.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,14 +25,13 @@ namespace ML { -using namespace Dbscan; // Default max mem set to a reasonable value for a 16gb card. static const size_t DEFAULT_MAX_MEM_MBYTES = 13e3; template -Index_ computeBatchCount(size_t &estimated_memory, Index_ n_rows, - size_t max_mbytes_per_batch = 0, - Index_ neigh_per_row = 0) { +Index_ compute_batch_size(size_t &estimated_memory, Index_ n_rows, + Index_ n_owned_rows, size_t max_mbytes_per_batch = 0, + Index_ neigh_per_row = 0) { // In real applications, it's unlikely that the sparse adjacency matrix // comes even close to the worst-case memory usage, because if epsilon // is so large that all points are connected to 10% or even more of other @@ -41,16 +40,21 @@ Index_ computeBatchCount(size_t &estimated_memory, Index_ n_rows, if (neigh_per_row <= 0) neigh_per_row = n_rows; - // we'll estimate the memory consumption per row. 
- // First the dense adjacency matrix - estimated_memory = n_rows * sizeof(bool); - // sparse adjacency matrix - estimated_memory += neigh_per_row * sizeof(Index_); - // core points and two indicator variables - estimated_memory += 3 * sizeof(bool); - // the rest will be so small that it should fit into what we have left over + /* Memory needed per batch row: + * - Dense adj matrix: n_rows (bool) + * - Sparse adj matrix: neigh_per_row (Index_) + * - Vertex degrees: 1 (Index_) + * - Ex scan: 1 (Index_) + */ + size_t est_mem_per_row = + n_rows * sizeof(bool) + (neigh_per_row + 2) * sizeof(Index_); + /* Memory needed regardless of the batch size: + * - Temporary labels: n_rows (Index_) + * - Core point mask: n_rows (bool) + */ + size_t est_mem_fixed = n_rows * (sizeof(Index_) + sizeof(bool)); + // The rest will be so small that it should fit into what we have left over // from the over-estimation of the sparse adjacency matrix - estimated_memory *= n_rows; if (max_mbytes_per_batch <= 0) { /* using default here as in decision tree, waiting for mem info from device allocator @@ -60,69 +64,93 @@ Index_ computeBatchCount(size_t &estimated_memory, Index_ n_rows, max_mbytes_per_batch = DEFAULT_MAX_MEM_MBYTES; } - Index_ nBatches = (Index_)raft::ceildiv( - estimated_memory, max_mbytes_per_batch * 1000000); + // Batch size determined based on available memory + Index_ batch_size = + (max_mbytes_per_batch * 1000000 - est_mem_fixed) / est_mem_per_row; + + // Limit batch size to number of owned rows + batch_size = std::min(n_owned_rows, batch_size); + + // To avoid overflow, we need: batch_size <= MAX_LABEL / n_rows (floor div) Index_ MAX_LABEL = std::numeric_limits::max(); - // to avoid overflow, we need: batch_size <= MAX_LABEL / n_rows (floor div) - // -> num_batches >= raft::ceildiv(n_rows / (MAX_LABEL / n_rows)) - Index_ nBatchesPrec = raft::ceildiv(n_rows, MAX_LABEL / n_rows); - // at some point, if nBatchesPrec is larger than nBatches - // (or larger by a given factor) and we know that there are clear - // performance benefits of using a smaller number of batches, - // we should probably warn the user. - // In the latest benchmarks, it seems like using int64 indexing and batches - // that are much larger than 2.10^9 points (the limit for int32), doesn't - // actually improve performance, even when using >16.10^9 points per batch. - // Much larger batches than 16.10^9 do not currently fit on GPU architectures + if (batch_size > MAX_LABEL / n_rows) { + Index_ new_batch_size = MAX_LABEL / n_rows; + CUML_LOG_WARN( + "Batch size limited by the chosen integer type (%d bytes). %d -> %d. " + "Using the larger integer type might result in better performance", + (int)sizeof(Index_), (int)batch_size, (int)new_batch_size); + batch_size = new_batch_size; + } + + // Warn when a smaller index type could be used if (sizeof(Index_) > sizeof(int) && - (size_t)n_rows * raft::ceildiv(n_rows, nBatches) < - std::numeric_limits::max()) { + batch_size < std::numeric_limits::max() / n_rows) { CUML_LOG_WARN( "You are using an index type of size (%d bytes) but a smaller index " - "type (%d bytes) would be sufficient. Consider using the smaller " - "index type for better performance.", + "type (%d bytes) would be sufficient. 
Using the smaller integer type " + "might result in better performance.", (int)sizeof(Index_), (int)sizeof(int)); } - if (nBatchesPrec > nBatches) { - nBatches = nBatchesPrec; - // we have to re-adjust memory estimation here - estimated_memory = nBatches * (estimated_memory / n_rows); - } - return max((Index_)1, nBatches); + + estimated_memory = batch_size * est_mem_per_row + est_mem_fixed; + return batch_size; } -template +template void dbscanFitImpl(const raft::handle_t &handle, T *input, Index_ n_rows, Index_ n_cols, T eps, Index_ min_pts, Index_ *labels, Index_ *core_sample_indices, size_t max_mbytes_per_batch, cudaStream_t stream, int verbosity) { ML::PUSH_RANGE("ML::Dbscan::Fit"); ML::Logger::get().setLevel(verbosity); - int algoVd = 1; - int algoAdj = 1; - int algoCcl = 2; + int algo_vd = 1; + int algo_adj = 1; + int algo_ccl = 2; + + int my_rank, n_rank; + Index_ start_row, n_owned_rows; + if (opg) { + const auto &comm = handle.get_comms(); + my_rank = comm.get_rank(); + n_rank = comm.get_size(); + Index_ rows_per_rank = raft::ceildiv(n_rows, n_rank); + start_row = my_rank * rows_per_rank; + Index_ end_row = min((my_rank + 1) * rows_per_rank, n_rows); + n_owned_rows = max(Index_(0), end_row - start_row); + // Note: it is possible for a node to have no work in theory. It won't + // happen in practice (because n_rows is much greater than n_rank) + } else { + my_rank = 0; + n_rank = 1; + n_owned_rows = n_rows; + } + + CUML_LOG_DEBUG("#%d owns %ld rows", (int)my_rank, + (unsigned long)n_owned_rows); - ///@todo: Query device for remaining memory + /// TODO: Query device for remaining memory size_t estimated_memory; - Index_ n_batches = - computeBatchCount(estimated_memory, n_rows, max_mbytes_per_batch); + Index_ batch_size = compute_batch_size( + estimated_memory, n_rows, n_owned_rows, max_mbytes_per_batch); - if (n_batches > 1) { - CUML_LOG_DEBUG("Running batched training on %ld batches w/ %lf MB", - (unsigned long)n_batches, - (double)estimated_memory * 1e-6 / n_batches); - } + CUML_LOG_DEBUG( + "Running batched training (batch size: %ld, estimated: %lf MB)", + (unsigned long)batch_size, (double)estimated_memory * 1e-6); + + size_t workspaceSize = Dbscan::run( + handle, input, n_rows, n_cols, start_row, n_owned_rows, eps, min_pts, + labels, core_sample_indices, algo_vd, algo_adj, algo_ccl, NULL, batch_size, + stream); - size_t workspaceSize = Dbscan::run( - handle, input, n_rows, n_cols, eps, min_pts, labels, core_sample_indices, - algoVd, algoAdj, algoCcl, NULL, n_batches, stream); + CUML_LOG_DEBUG("Workspace size: %lf MB", (double)workspaceSize * 1e-6); MLCommon::device_buffer workspace(handle.get_device_allocator(), stream, workspaceSize); - Dbscan::run(handle, input, n_rows, n_cols, eps, min_pts, labels, - core_sample_indices, algoVd, algoAdj, algoCcl, workspace.data(), - n_batches, stream); + Dbscan::run(handle, input, n_rows, n_cols, start_row, + n_owned_rows, eps, min_pts, labels, + core_sample_indices, algo_vd, algo_adj, algo_ccl, + workspace.data(), batch_size, stream); ML::POP_RANGE(); } -}; // namespace ML +} // namespace ML diff --git a/cpp/src/dbscan/dbscan_api.cpp b/cpp/src/dbscan/dbscan_api.cpp index 15cb1bf684..a7570d3da5 100644 --- a/cpp/src/dbscan/dbscan_api.cpp +++ b/cpp/src/dbscan/dbscan_api.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,8 +28,8 @@ cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, int n_rows, std::tie(handle_ptr, status) = ML::handleMap.lookupHandlePointer(handle); if (status == CUML_SUCCESS) { try { - ML::dbscanFit(*handle_ptr, input, n_rows, n_cols, eps, min_pts, labels, - core_sample_indices, max_bytes_per_batch, verbosity); + ML::Dbscan::fit(*handle_ptr, input, n_rows, n_cols, eps, min_pts, labels, + core_sample_indices, max_bytes_per_batch, verbosity); } //TODO: Implement this //catch (const MLCommon::Exception& e) @@ -53,8 +53,8 @@ cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, int n_rows, std::tie(handle_ptr, status) = ML::handleMap.lookupHandlePointer(handle); if (status == CUML_SUCCESS) { try { - ML::dbscanFit(*handle_ptr, input, n_rows, n_cols, eps, min_pts, labels, - core_sample_indices, max_bytes_per_batch, verbosity); + ML::Dbscan::fit(*handle_ptr, input, n_rows, n_cols, eps, min_pts, labels, + core_sample_indices, max_bytes_per_batch, verbosity); } //TODO: Implement this //catch (const MLCommon::Exception& e) diff --git a/cpp/src/dbscan/dbscan_api.h b/cpp/src/dbscan/dbscan_api.h index e27f41ba7d..e7877200a2 100644 --- a/cpp/src/dbscan/dbscan_api.h +++ b/cpp/src/dbscan/dbscan_api.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/dbscan/mergelabels/runner.cuh b/cpp/src/dbscan/mergelabels/runner.cuh new file mode 100644 index 0000000000..1fc60144c0 --- /dev/null +++ b/cpp/src/dbscan/mergelabels/runner.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include