From f86d76586e8a4c2f825ad724fe3c68e3fa3b9b74 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 9 Feb 2022 18:44:45 -0500
Subject: [PATCH 01/24] Moving more linalg prims from cuml

---
 cpp/include/raft/linalg/detail/axpy.hpp       |   1 -
 cpp/include/raft/linalg/detail/gemm.hpp       |   1 -
 cpp/include/raft/linalg/detail/gemv.hpp       |   1 -
 cpp/include/raft/linalg/detail/lstsq.hpp      | 444 ++++++++++++++++++
 .../raft/linalg/detail/reduce_cols_by_key.cuh |  81 ++++
 .../raft/linalg/detail/reduce_rows_by_key.cuh | 431 +++++++++++++++++
 cpp/include/raft/linalg/detail/rsvd.cuh       | 412 ++++++++++++++++
 cpp/include/raft/linalg/detail/ternary_op.cuh | 102 ++++
 cpp/include/raft/linalg/lstsq.hpp             |  98 ++++
 cpp/include/raft/linalg/power.cuh             |  63 +++
 .../raft/linalg/reduce_cols_by_key.cuh        |  54 +++
 .../raft/linalg/reduce_rows_by_key.cuh        |  97 ++++
 cpp/include/raft/linalg/rsvd.cuh              | 109 +++++
 cpp/include/raft/linalg/sqrt.cuh              |  44 ++
 cpp/include/raft/linalg/ternary_op.cuh        |  49 ++
 cpp/test/CMakeLists.txt                       |   5 +
 cpp/test/linalg/power.cu                      | 135 ++++++
 cpp/test/linalg/reduce_cols_by_key.cu         | 124 +++++
 cpp/test/linalg/reduce_rows_by_key.cu         | 262 +++++++++++
 cpp/test/linalg/rsvd.cu                       | 315 +++++++++++++
 cpp/test/linalg/sqrt.cu                       | 114 +++++
 cpp/test/linalg/ternary_op.cu                 | 107 +++++
 22 files changed, 3046 insertions(+), 3 deletions(-)
 create mode 100644 cpp/include/raft/linalg/detail/lstsq.hpp
 create mode 100644 cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh
 create mode 100644 cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
 create mode 100644 cpp/include/raft/linalg/detail/rsvd.cuh
 create mode 100644 cpp/include/raft/linalg/detail/ternary_op.cuh
 create mode 100644 cpp/include/raft/linalg/lstsq.hpp
 create mode 100644 cpp/include/raft/linalg/power.cuh
 create mode 100644 cpp/include/raft/linalg/reduce_cols_by_key.cuh
 create mode 100644 cpp/include/raft/linalg/reduce_rows_by_key.cuh
 create mode 100644 cpp/include/raft/linalg/rsvd.cuh
 create mode 100644 cpp/include/raft/linalg/sqrt.cuh
 create mode 100644 cpp/include/raft/linalg/ternary_op.cuh
 create mode 100644 cpp/test/linalg/power.cu
 create mode 100644 cpp/test/linalg/reduce_cols_by_key.cu
 create mode 100644 cpp/test/linalg/reduce_rows_by_key.cu
 create mode 100644 cpp/test/linalg/rsvd.cu
 create mode 100644 cpp/test/linalg/sqrt.cu
 create mode 100644 cpp/test/linalg/ternary_op.cu

diff --git a/cpp/include/raft/linalg/detail/axpy.hpp b/cpp/include/raft/linalg/detail/axpy.hpp
index f5527bf10f..c0ce398de9 100644
--- a/cpp/include/raft/linalg/detail/axpy.hpp
+++ b/cpp/include/raft/linalg/detail/axpy.hpp
@@ -20,7 +20,6 @@
 
 #include "cublas_wrappers.hpp"
 
-#include <raft/cuda_utils.cuh>
 #include <raft/handle.hpp>
 
 namespace raft::linalg::detail {
diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp
index 0ea1723a9e..29308304d8 100644
--- a/cpp/include/raft/linalg/detail/gemm.hpp
+++ b/cpp/include/raft/linalg/detail/gemm.hpp
@@ -20,7 +20,6 @@
 
 #include "cublas_wrappers.hpp"
 
-#include <raft/cuda_utils.cuh>
 #include <raft/handle.hpp>
 
 namespace raft {
diff --git a/cpp/include/raft/linalg/detail/gemv.hpp b/cpp/include/raft/linalg/detail/gemv.hpp
index 3692743152..ad2e5275cb 100644
--- a/cpp/include/raft/linalg/detail/gemv.hpp
+++ b/cpp/include/raft/linalg/detail/gemv.hpp
@@ -20,7 +20,6 @@
 
 #include "cublas_wrappers.hpp"
 
-#include <raft/cuda_utils.cuh>
 #include <raft/handle.hpp>
 
 namespace raft {
diff --git a/cpp/include/raft/linalg/detail/lstsq.hpp b/cpp/include/raft/linalg/detail/lstsq.hpp
new file mode 100644
index 0000000000..e8aeccc9b0
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/lstsq.hpp
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/common/nvtx.hpp>
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/detail/cusolver_wrappers.hpp>
+#include <raft/linalg/eig.hpp>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/gemm.hpp>
+#include <raft/linalg/gemv.hpp>
+#include <raft/linalg/qr.hpp>
+#include <raft/linalg/svd.hpp>
+#include <raft/linalg/transpose.hpp>
+#include <raft/matrix/math.hpp>
+#include <raft/matrix/matrix.hpp>
+#include <raft/random/rng.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft {
+    namespace linalg {
+        namespace detail {
+
+            namespace {
+
+/** Operate a CUDA event if we're in the concurrent mode; no-op otherwise. */
+                struct DeviceEvent {
+                private:
+                    cudaEvent_t e = nullptr;  // stays null when not running concurrently
+
+                public:
+                    /** Creates the underlying CUDA event only when `concurrent` is true. */
+                    explicit DeviceEvent(bool concurrent) {
+                        if (concurrent) RAFT_CUDA_TRY(cudaEventCreate(&e));
+                    }
+
+                    /** Destroys the event if one was created (checked with the NO_THROW macro). */
+                    ~DeviceEvent() {
+                        if (e != nullptr) RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(e));
+                    }
+
+                    /** Exposes the raw handle (nullptr in the non-concurrent mode). */
+                    operator cudaEvent_t() const { return e; }
+
+                    /** Records the event on `stream`; no-op without an event. */
+                    void record(cudaStream_t stream) {
+                        if (e != nullptr) RAFT_CUDA_TRY(cudaEventRecord(e, stream));
+                    }
+                    /** Makes `stream` wait for the recorded event; no-op without an event. */
+                    void wait(cudaStream_t stream) {
+                        if (e != nullptr) RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, e, 0u));
+                    }
+                    /** Synchronizes on the event via raft::interruptible; no-op without an event. */
+                    void wait() { if (e != nullptr) raft::interruptible::synchronize(e); }
+                    // The handle is owned exclusively: a copy would destroy the same event twice.
+                    DeviceEvent(const DeviceEvent &other) = delete;
+                    DeviceEvent &operator=(const DeviceEvent &other) = delete;
+                };
+
+/**
+ *  @brief Tells if the viewed CUDA stream is implicitly synchronized with the given stream.
+ *
+ *  This can happen e.g.
+ *   if the two views point to the same stream
+ *   or sometimes when one of them is the legacy default stream.
+ */
+                bool are_implicitly_synchronized(rmm::cuda_stream_view a, rmm::cuda_stream_view b) {
+                    // identical handles: a stream is trivially ordered with itself
+                    if (a.value() == b.value()) { return true; }
+                    // true when the stream's flags do not include cudaStreamNonBlocking
+                    auto is_blocking = [](rmm::cuda_stream_view s) {
+                        unsigned int stream_flags = 0;
+                        RAFT_CUDA_TRY(cudaStreamGetFlags(s.value(), &stream_flags));
+                        return (stream_flags & cudaStreamNonBlocking) == 0;
+                    };
+                    // default stream paired with a blocking stream, checked in both directions;
+                    // the flags are only queried when the other view is the default stream
+                    if (a.is_default() && is_blocking(b)) { return true; }
+                    if (b.is_default() && is_blocking(a)) { return true; }
+                    return false;
+                }
+
+                template<typename math_t>
+                struct DivideByNonZero {
+                    // divisors with a magnitude below this threshold are not divided by
+                    constexpr static const math_t eps = math_t(1e-10);
+                    // yields a / b when |b| >= eps, and a unchanged otherwise
+                    __device__ math_t operator()(const math_t a, const math_t b) const {
+                        const bool usable_divisor = raft::myAbs<math_t>(b) >= eps;
+                        if (usable_divisor) { return a / b; }
+                        return a;
+                    }
+                };
+
+            }  // namespace
+
+/** Solves the linear ordinary least squares problem `Aw = b`
+ *  Via SVD decomposition of `A = U S Vt` using default cuSOLVER routine.
+ *
+ *  @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines,
+ *             so it's not guaranteed to stay unmodified.
+ */
+            template<typename math_t>
+            void lstsqSvdQR(const raft::handle_t &handle,
+                            math_t *A,
+                            const int n_rows,
+                            const int n_cols,
+                            const math_t *b,
+                            math_t *w,
+                            cudaStream_t stream) {
+                const int rank_bound = min(n_rows, n_cols);
+                cusolverDnHandle_t solver = handle.get_cusolver_dn_handle();
+
+                // ask cuSOLVER how much scratch space the gesvd call below needs
+                // #TODO: Call from public API when ready
+                int scratch_len = 0;
+                RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvd_bufferSize<math_t>(
+                        solver, n_rows, n_cols, &scratch_len));
+
+                // a single allocation holds every temporary, laid out back to back:
+                //   [cuSOLVER scratch | U | Vt | S | U^T * b | devInfo]
+                const int len_U = n_rows * rank_bound;
+                const int len_Vt = n_cols * n_cols;
+                rmm::device_uvector <math_t> arena(
+                        scratch_len + len_U + len_Vt + rank_bound + rank_bound + 1, stream);
+                math_t *scratch = arena.data();
+                math_t *U = scratch + scratch_len;
+                math_t *Vt = U + len_U;
+                math_t *S = Vt + len_Vt;
+                math_t *Ub = S + rank_bound;
+                // the last math_t-sized slot is reinterpreted as the int status word
+                int *devInfo = reinterpret_cast<int *>(Ub + rank_bound);
+
+                // decompose A; both job arguments are 'S'
+                // #TODO: Call from public API when ready
+                RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvd<math_t>(
+                        solver, 'S', 'S', n_rows, n_cols,
+                        A, n_rows,
+                        S,
+                        U, n_rows,
+                        Vt, n_cols,
+                        scratch, scratch_len,
+                        nullptr,
+                        devInfo,
+                        stream));
+
+                // Ub <- U^T * b
+                raft::linalg::gemv(handle, U, n_rows, rank_bound, b, Ub, true, stream);
+                // Ub <- Ub / S element-wise; entries whose divisor is below eps stay undivided
+                raft::linalg::binaryOp(Ub, Ub, S, rank_bound, DivideByNonZero<math_t>(), stream);
+                // w <- product of the Vt block (leading dimension n_cols) with Ub, using the
+                // same `true` flag as the first gemv
+                raft::linalg::gemv(handle, Vt, rank_bound, n_cols, n_cols, Ub, w, true, stream);
+            }
+
+/** Solves the linear ordinary least squares problem `Aw = b`
+ *  Via SVD decomposition of `A = U S V^T` using Jacobi iterations (cuSOLVER).
+ *
+ *  @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines,
+ *             so it's not guaranteed to stay unmodified.
+ */
+            template<typename math_t>
+            void lstsqSvdJacobi(const raft::handle_t &handle,
+                                math_t *A,
+                                const int n_rows,
+                                const int n_cols,
+                                const math_t *b,
+                                math_t *w,
+                                cudaStream_t stream) {
+                const int minmn = min(n_rows, n_cols);
+                // Owns the gesvdj parameter object so that it is released on every exit path,
+                // including when one of the checked calls below throws.
+                struct GesvdjInfoGuard {
+                    gesvdjInfo_t info = nullptr;
+                    // the status is deliberately ignored: a destructor must not throw
+                    ~GesvdjInfoGuard() { if (info != nullptr) cusolverDnDestroyGesvdjInfo(info); }
+                } gesvdj_guard;
+                RAFT_CUSOLVER_TRY(cusolverDnCreateGesvdjInfo(&gesvdj_guard.info));
+                gesvdjInfo_t gesvdj_params = gesvdj_guard.info;
+
+                int cusolverWorkSetSize = 0;
+                cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+                // #TODO: Call from public API when ready
+                RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvdj_bufferSize<math_t>(
+                        cusolverH,
+                        CUSOLVER_EIG_MODE_VECTOR,
+                        1, n_rows, n_cols,
+                        A, n_rows,
+                        nullptr,
+                        nullptr, n_rows,
+                        nullptr, n_cols,
+                        &cusolverWorkSetSize,
+                        gesvdj_params));
+
+                // One allocation holds all temporaries, laid out back to back:
+                //   [cuSOLVER workspace | U | V | S | U^T * b | devInfo]
+                rmm::device_uvector <math_t> workset(
+                        cusolverWorkSetSize + n_rows * minmn + n_cols * minmn + minmn + minmn + 1,
+                        stream);
+                math_t *cusolverWorkSet = workset.data();
+                math_t *U = cusolverWorkSet + cusolverWorkSetSize;
+                math_t *V = U + n_rows * minmn;
+                math_t *S = V + n_cols * minmn;
+                math_t *Ub = S + minmn;
+                int *devInfo = reinterpret_cast<int *>(Ub + minmn);
+
+                // #TODO: Call from public API when ready
+                RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvdj<math_t>(
+                        cusolverH,
+                        CUSOLVER_EIG_MODE_VECTOR,
+                        1, n_rows, n_cols,
+                        A, n_rows,
+                        S,
+                        U, n_rows,
+                        V, n_cols,
+                        cusolverWorkSet, cusolverWorkSetSize,
+                        devInfo,
+                        gesvdj_params,
+                        stream));
+                // Ub <- U^T * b
+                raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream);
+                // Ub <- Ub / S element-wise; entries whose divisor is below eps stay undivided
+                raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero<math_t>(), stream);
+                // w <- V * Ub
+                raft::linalg::gemv(handle, V, n_cols, minmn, Ub, w, false, stream);
+            }
+
+/** Solves the linear ordinary least squares problem `Aw = b`
+ *  via eigenvalue decomposition of `A^T * A` (covariance matrix for dataset A).
+ *  (`w = (A^T A)^-1  A^T b`)
+ */
+            template<typename math_t>
+            void lstsqEig(const raft::handle_t &handle,
+                          const math_t *A,
+                          const int n_rows,
+                          const int n_cols,
+                          const math_t *b,
+                          math_t *w,
+                          cudaStream_t stream) {
+                rmm::cuda_stream_view mainStream = rmm::cuda_stream_view(stream);
+                rmm::cuda_stream_view multAbStream = mainStream;
+                bool concurrent = false;  // set when A* b is issued on a second stream
+                {  // choose the stream(s) from the handle's stream pool, if it has any
+                    int sp_size = handle.get_stream_pool_size();
+                    if (sp_size > 0) {
+                        multAbStream = handle.get_stream_from_stream_pool(0);
+                        // check if the two streams can run concurrently
+                        if (!are_implicitly_synchronized(mainStream, multAbStream)) {
+                            concurrent = true;
+                        } else if (sp_size > 1) {  // use pool streams 0 and 1; `stream` is unused below
+                            mainStream = multAbStream;
+                            multAbStream = handle.get_stream_from_stream_pool(1);
+                            concurrent = true;
+                        }
+                    }
+                }
+                // the event is created only if the given raft handle is capable of running
+                // at least two CUDA streams without implicit synchronization.
+                DeviceEvent multAbDone(concurrent);
+                // scratch layout: [Q | QS | covA] (n_cols * n_cols each), then [S | Ab] (n_cols each)
+                rmm::device_uvector <math_t> workset(n_cols * n_cols * 3 + n_cols * 2, mainStream);
+                math_t *Q = workset.data();
+                math_t *QS = Q + n_cols * n_cols;
+                math_t *covA = QS + n_cols * n_cols;
+                math_t *S = covA + n_cols * n_cols;
+                math_t *Ab = S + n_cols;
+
+                // covA <- A* A  (issued on mainStream)
+                math_t alpha = math_t(1);
+                math_t beta = math_t(0);
+                raft::linalg::gemm(handle,
+                                   A,
+                                   n_rows,
+                                   n_cols,
+                                   A,
+                                   covA,
+                                   n_cols,
+                                   n_cols,
+                                   CUBLAS_OP_T,
+                                   CUBLAS_OP_N,
+                                   alpha,
+                                   beta,
+                                   mainStream);
+
+                // Ab <- A* b  (issued on multAbStream, then marked done with the event)
+                raft::linalg::gemv(handle, A, n_rows, n_cols, b, Ab, true, multAbStream);
+                multAbDone.record(multAbStream);
+
+                // Q S Q* <- covA
+                raft::common::nvtx::push_range("raft::linalg::eigDC");
+                raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream);
+                raft::common::nvtx::pop_range();
+
+                // QS  <- Q invS
+                raft::linalg::matrixVectorOp(
+                        QS, Q, S, n_cols, n_cols, false, true, DivideByNonZero<math_t>(), mainStream);
+                // covA <- QS Q* == Q invS Q* == inv(A* A)
+                raft::linalg::gemm(handle,
+                                   QS,
+                                   n_cols,
+                                   n_cols,
+                                   Q,
+                                   covA,
+                                   n_cols,
+                                   n_cols,
+                                   CUBLAS_OP_N,
+                                   CUBLAS_OP_T,
+                                   alpha,
+                                   beta,
+                                   mainStream);
+                multAbDone.wait(mainStream);  // Ab is read next; no-op when no event was created
+                // w <- covA Ab == Q invS Q* A b == inv(A* A) A b
+                raft::linalg::gemv(handle, covA, n_cols, n_cols, Ab, w, false, mainStream);
+            }
+
+/** Solves the linear ordinary least squares problem `Aw = b`
+ *  via QR decomposition of `A = QR`.
+ *  (triangular system of equations `Rw = Q^T b`)
+ *
+ * @param A[in/out] - input feature matrix.
+ *            Warning: the content of this matrix is modified by the cuSOLVER routines.
+ * @param b[in/out] - input target vector.
+ *            Warning: the content of this vector is modified by the cuSOLVER routines.
+ */
+            template<typename math_t>
+            void lstsqQR(const raft::handle_t &handle,
+                         math_t *A,
+                         const int n_rows,
+                         const int n_cols,
+                         math_t *b,
+                         math_t *w,
+                         cudaStream_t stream) {
+                cublasHandle_t cublasH = handle.get_cublas_handle();
+                cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+
+                int m = n_rows;
+                int n = n_cols;
+
+                int info = 0;  // host copy of the device-side status word
+                rmm::device_uvector <math_t> d_tau(n, stream);
+                rmm::device_scalar<int> d_info(stream);
+                // Q^T is applied from the left, i.e. b <- Q^T b
+                const cublasSideMode_t side = CUBLAS_SIDE_LEFT;
+                const cublasOperation_t trans = CUBLAS_OP_T;
+
+                int lwork_geqrf = 0;
+                int lwork_ormqr = 0;
+                int lwork = 0;
+                // both A and b use the row count as their leading dimension
+                const int lda = m;
+                const int ldb = m;
+                // query the workspace size of both cuSOLVER routines; the larger one is allocated
+                // #TODO: Call from public API when ready
+                RAFT_CUSOLVER_TRY(
+                        raft::linalg::detail::cusolverDngeqrf_bufferSize(cusolverH, m, n, A, lda, &lwork_geqrf));
+
+                // #TODO: Call from public API when ready
+                RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnormqr_bufferSize(cusolverH,
+                        side,
+                        trans,
+                        m,
+                        1,
+                        n,
+                        A,
+                        lda,
+                        d_tau.data(),
+                        b,    // C,
+                        ldb,  // ldc, the same value that is passed to the ormqr call below
+                        &lwork_ormqr));
+
+                lwork = (lwork_geqrf > lwork_ormqr) ? lwork_geqrf : lwork_ormqr;
+
+                rmm::device_uvector <math_t> d_work(lwork, stream);
+                // factorize A = QR in place
+                // #TODO: Call from public API when ready
+                RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngeqrf(
+                        cusolverH, m, n, A, lda, d_tau.data(), d_work.data(), lwork, d_info.data(), stream));
+                // the status lives on the device: copy it back and wait for the stream
+                RAFT_CUDA_TRY(cudaMemcpyAsync(&info, d_info.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
+                RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+                ASSERT(0 == info, "lstsq.h: QR wasn't successful");
+                // b <- Q^T b
+                // #TODO: Call from public API when ready
+                RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnormqr(cusolverH,
+                        side,
+                        trans,
+                        m,
+                        1,
+                        n,
+                        A,
+                        lda,
+                        d_tau.data(),
+                        b,
+                        ldb,
+                        d_work.data(),
+                        lwork,
+                        d_info.data(),
+                        stream));
+                // same status check as above, reported separately from the factorization step
+                RAFT_CUDA_TRY(cudaMemcpyAsync(&info, d_info.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
+                RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+                ASSERT(0 == info, "lstsq.h: ormqr (Q^T * b) wasn't successful");
+
+                const math_t one = 1;
+                // solve the upper-triangular system R w = Q^T b; the result overwrites b
+                // #TODO: Call from public API when ready
+                RAFT_CUBLAS_TRY(raft::linalg::detail::cublastrsm(cublasH,
+                        side,
+                        CUBLAS_FILL_MODE_UPPER,
+                        CUBLAS_OP_N,
+                        CUBLAS_DIAG_NON_UNIT,
+                        n,
+                        1,
+                        &one,
+                        A,
+                        lda,
+                        b,
+                        ldb,
+                        stream));
+                // the first n entries of b are copied out as the solution
+                RAFT_CUDA_TRY(cudaMemcpyAsync(w, b, sizeof(math_t) * n, cudaMemcpyDeviceToDevice, stream));
+            }
+
+        };  // namespace detail
+    };  // namespace linalg
+};  // namespace raft
diff --git a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh
new file mode 100644
index 0000000000..307ed30c57
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cub/cub.cuh>
+#include <limits>
+#include <raft/cuda_utils.cuh>
+#include <stdlib.h>
+
+namespace raft {
+    namespace linalg {
+        namespace detail {
+
+///@todo: support col-major
+///@todo: specialize this to support shared-mem based atomics
+
+        template<typename T, typename KeyIteratorT, typename IdxType>
+        __global__ void reduce_cols_by_key_kernel(
+                const T *data, const KeyIteratorT keys, T *out, IdxType nrows, IdxType ncols, IdxType nkeys) {
+            typedef typename std::iterator_traits<KeyIteratorT>::value_type KeyType;
+            // widen blockIdx.x first so the flat index is not truncated to 32 bits for a 64-bit IdxType
+            IdxType idx = static_cast<IdxType>(blockIdx.x) * blockDim.x + threadIdx.x;
+            if (idx >= (nrows * ncols)) return;  // one thread per element of `data`
+            ///@todo: yikes! use fast-int-div
+            IdxType colId = idx % ncols;
+            IdxType rowId = idx / ncols;
+            KeyType key = keys[colId];
+            raft::myAtomicAdd(out + rowId * nkeys + key, data[idx]);  // out[rowId][key] += data[idx]
+        }
+
+/**
+ * @brief Computes the sum-reduction of matrix columns for each given key
+ * @tparam T the input data type (as well as the output reduced matrix)
+ * @tparam KeyIteratorT iterator type over the keys; its value_type is the key type
+ * @tparam IdxType indexing arithmetic type
+ * @param data the input data (dim = nrows x ncols). This is assumed to be in
+ * row-major layout
+ * @param keys keys array (len = ncols). It is assumed that each key in this
+ * array is between [0, nkeys). In case this is not true, the caller is expected
+ * to have called make_monotonic primitive to prepare such a contiguous and
+ * monotonically increasing keys array.
+ * @param out the output reduced matrix along columns (dim = nrows x nkeys).
+ * This will be assumed to be in row-major layout
+ * @param nrows number of rows in the input data
+ * @param ncols number of columns in the input data
+ * @param nkeys number of unique keys in the keys array
+ * @param stream cuda stream to launch the kernel onto
+ */
+        template<typename T, typename KeyIteratorT, typename IdxType = int>
+        void reduce_cols_by_key(const T *data,
+                                const KeyIteratorT keys,
+                                T *out,
+                                IdxType nrows,
+                                IdxType ncols,
+                                IdxType nkeys,
+                                cudaStream_t stream) {
+            RAFT_CUDA_TRY(cudaMemsetAsync(out, 0, sizeof(T) * nrows * nkeys, stream));  // atomics add into out
+            // empty input: `out` is already zeroed, and a zero-block launch would be invalid
+            if (nrows * ncols == 0) return;
+            constexpr int TPB = 256;
+            int nblks = (int) raft::ceildiv<IdxType>(nrows * ncols, TPB);
+            reduce_cols_by_key_kernel<<<nblks, TPB, 0, stream>>>(data, keys, out, nrows, ncols, nkeys);
+            RAFT_CUDA_TRY(cudaPeekAtLastError());
+        }
+    };  // end namespace detail
+    };  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
new file mode 100644
index 0000000000..f4fa892472
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+
+#include <cub/cub.cuh>
+
+#include <limits>
+
+#define MAX_BLOCKS 65535u
+namespace raft {
+    namespace linalg {
+        namespace detail {
+
+//
+// Small helper function to convert from int->char and char->int
+// Transform ncols*nrows read of int in 2*nrows reads of int + ncols*rows reads of chars
+//
+
+            template<typename IteratorT1, typename IteratorT2>
+            void __global__ convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n) {
+                // Element-wise copy of n items; the assignment converts to dst's element type.
+                const int step = gridDim.x * blockDim.x;
+                for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; i += step) dst[i] = src[i];
+            }
+
+            // Launches convert_array_kernel to copy/convert n elements from src to dst on stream st.
+            template<typename IteratorT1, typename IteratorT2>
+            void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st) {
+                if (n <= 0) return;  // nothing to convert; a zero-block grid is an invalid launch
+                dim3 grid, block;
+                block.x = 256;
+                grid.x = std::min((unsigned int) raft::ceildiv(n, (int) block.x), MAX_BLOCKS);
+                convert_array_kernel<<<grid, block, 0, st>>>(dst, src, n);
+                RAFT_CUDA_TRY(cudaPeekAtLastError());  // surface launch-configuration errors here
+            }
+
+            template<typename T>
+            struct quad {  // plain aggregate of four values of type T
+                T x; T y; T z; T w;
+            };
+
+//
+// Functor for reduce by key, small k
+//
+            template<typename T>
+            struct quadSum {
+                // Component-wise sum of two quads; used as the binary operator for cub::BlockReduce.
+                __host__ __device__ __forceinline__ quad<T> operator()(const quad<T> &lhs,
+                                                                       const quad<T> &rhs) const {
+                    quad<T> total = lhs;
+                    total.x += rhs.x;
+                    total.y += rhs.y;
+                    total.z += rhs.z;
+                    total.w += rhs.w;
+                    return total;
+                }
+            };
+
+//
+// Reduce by keys
+// We need to sum each dimension by labels
+// The labels are not adjacent
+//
+
+//
+// Reduce by keys - for keys <= 4
+//
+
+#define SUM_ROWS_SMALL_K_DIMX         256
+#define SUM_ROWS_BY_KEY_SMALL_K_MAX_K 4
+            // Per-column sums of the rows of d_A grouped by key, for at most 4 keys (key values 0..3).
+            // Expects blockDim.x == SUM_ROWS_SMALL_K_DIMX; blocks stride over rows (x) and columns (y).
+            // Results are atomically added to d_sums[key * ncols + col], so d_sums must be pre-zeroed.
+            template<typename DataIteratorT, typename WeightT>
+            __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4)
+            __global__
+            void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A,
+                                                    int lda,
+                                                    const char *d_keys,
+                                                    const WeightT *d_weights,
+                                                    int nrows,
+                                                    int ncols,
+                                                    int nkeys,
+                                                    DataIteratorT d_sums) {
+                typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+                typedef cub::BlockReduce<quad<DataType>, SUM_ROWS_SMALL_K_DIMX> BlockReduce;
+                __shared__ typename BlockReduce::TempStorage temp_storage;
+                // Typed zero: a bare 0.0 literal would promote float arithmetic to double.
+                const DataType zero = DataType(0);
+
+                for (int idim = static_cast<int>(blockIdx.y); idim < ncols; idim += gridDim.y) {
+                    if (idim != static_cast<int>(blockIdx.y)) __syncthreads();  // we're reusing temp_storage
+
+                    // Each thread keeps one partial sum per key (x,y,z,w <-> keys 0,1,2,3) for column idim
+                    quad<DataType> thread_sums;
+                    thread_sums.x = zero;
+                    thread_sums.y = zero;
+                    thread_sums.z = zero;
+                    thread_sums.w = zero;
+
+                    // May use vectorized load - not necessary for doubles
+                    for (int block_offset_irow = blockIdx.x * blockDim.x;
+                         block_offset_irow < nrows;  // block-uniform bound: all threads iterate equally
+                         block_offset_irow += blockDim.x * gridDim.x) {
+                        int irow = block_offset_irow + threadIdx.x;
+                        const bool in_range = irow < nrows;
+                        // 64-bit offset: irow * lda can exceed INT_MAX for large matrices
+                        DataType val = in_range ? d_A[static_cast<size_t>(irow) * lda + idim] : zero;
+                        if (d_weights && in_range) { val = val * d_weights[irow]; }
+                        // we are not reusing the keys - after profiling
+                        // d_keys is mainly loaded from L2, and this kernel is DRAM BW bounded
+                        // (experimentation gave a 10% speed up - not worth the many code lines added)
+                        int row_key = in_range ? d_keys[irow] : -1;
+
+                        thread_sums.x += (row_key == 0) ? val : zero;
+                        thread_sums.y += (row_key == 1) ? val : zero;
+                        thread_sums.z += (row_key == 2) ? val : zero;
+                        thread_sums.w += (row_key == 3) ? val : zero;
+                    }
+
+                    // Block-wide reduction by key; the reduced value is only valid in thread 0
+                    thread_sums = BlockReduce(temp_storage).Reduce(thread_sums, quadSum<DataType>());
+                    if (threadIdx.x < 32) {
+                        // Broadcast thread 0's totals across the first warp; only lanes 0..3 are needed
+                        thread_sums = cub::ShuffleIndex<32>(thread_sums, 0, 0xffffffff);
+                        if (static_cast<int>(threadIdx.x) < nkeys) {
+                            if (threadIdx.x == 0) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.x);
+                            if (threadIdx.x == 1) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.y);
+                            if (threadIdx.x == 2) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.z);
+                            if (threadIdx.x == 3) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.w);
+                        }
+                    }
+                }
+            }
+
+            // Host launcher for sum_rows_by_key_small_nkeys_kernel (keys 0..3 stored as char).
+            template<typename DataIteratorT, typename WeightT>
+            void sum_rows_by_key_small_nkeys(const DataIteratorT d_A,
+                                             int lda,
+                                             const char *d_keys,
+                                             const WeightT *d_weights,
+                                             int nrows,
+                                             int ncols,
+                                             int nkeys,
+                                             DataIteratorT d_sums,
+                                             cudaStream_t st) {
+                if (nrows <= 0 || ncols <= 0) return;  // empty input; a zero-sized grid is an invalid launch
+                dim3 grid, block;
+                block.x = SUM_ROWS_SMALL_K_DIMX;
+                block.y = 1;  // Necessary
+                grid.x = std::min((unsigned int) raft::ceildiv(nrows, (int) block.x), 32u);
+                grid.y = std::min((unsigned int) ncols, MAX_BLOCKS);
+                sum_rows_by_key_small_nkeys_kernel<<<grid, block, 0, st>>>(
+                        d_A, lda, d_keys, d_weights, nrows, ncols, nkeys, d_sums);
+                RAFT_CUDA_TRY(cudaPeekAtLastError());  // surface launch-configuration errors here
+            }
+
+//
+// Reduce by keys - large number of keys
+// Computing a "weighted histogram" with local histograms in smem
+// Keeping it simple - not optimized
+//
+
+#define SUM_ROWS_BY_KEY_LARGE_K_MAX_K 1024
+
+            // Column-major variant: per-block shared-memory histogram of (optionally weighted) values.
+            // Handles keys in [key_offset, key_offset + nkeys), nkeys <= SUM_ROWS_BY_KEY_LARGE_K_MAX_K;
+            // rows whose key falls outside that window are skipped. Adds into d_sums atomically.
+            template<typename DataIteratorT, typename KeysIteratorT, typename WeightT>
+            __global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A,
+                                                                        int lda,
+                                                                        const KeysIteratorT d_keys,
+                                                                        const WeightT *d_weights,
+                                                                        int nrows,
+                                                                        int ncols,
+                                                                        int key_offset,
+                                                                        int nkeys,
+                                                                        DataIteratorT d_sums) {
+                typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
+                typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+                __shared__ DataType local_sums[SUM_ROWS_BY_KEY_LARGE_K_MAX_K];
+                const DataType zero = DataType(0);  // typed zero avoids double promotion for float data
+
+                for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x)
+                    local_sums[local_key] = zero;
+
+                for (int idim = blockIdx.y; idim < ncols; idim += gridDim.y) {
+                    __syncthreads();  // local_sums is full of zeros from here on
+
+                    for (int irow = blockIdx.x * blockDim.x + threadIdx.x; irow < nrows;
+                         irow += blockDim.x * gridDim.x) {
+                        // Branch div in this loop - not an issue with current code
+                        DataType val = d_A[static_cast<size_t>(idim) * lda + irow];  // 64-bit offset
+                        if (d_weights) val = val * d_weights[irow];
+                        int local_key = d_keys[irow] - key_offset;
+                        // Keys outside this call's window would index past local_sums: skip them
+                        if (local_key < 0 || local_key >= nkeys) continue;
+                        raft::myAtomicAdd(&local_sums[local_key], val);
+                    }
+
+                    __syncthreads();  // local_sums
+
+                    for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) {
+                        DataType local_sum = local_sums[local_key];
+                        if (local_sum != zero) {
+                            KeyType global_key = key_offset + local_key;
+                            raft::myAtomicAdd(&d_sums[global_key * ncols + idim], local_sum);
+                            local_sums[local_key] = zero;
+                        }
+                    }
+                }
+            }
+
+            // Host launcher for the column-major large-nkeys kernel; this overload is unweighted.
+            template<typename DataIteratorT, typename KeysIteratorT>
+            void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT d_A,
+                                                      int lda,
+                                                      KeysIteratorT d_keys,
+                                                      int nrows, int ncols,
+                                                      int key_offset, int nkeys,
+                                                      DataIteratorT d_sums,
+                                                      cudaStream_t st) {
+                typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+                if (nrows <= 0 || ncols <= 0) return;  // empty input; a zero-sized grid is an invalid launch
+                dim3 grid, block;
+                block.x = SUM_ROWS_SMALL_K_DIMX;
+                block.y = 1;  // Necessary
+                grid.x = std::min((unsigned int) raft::ceildiv(nrows, (int) block.x), 32u);
+                grid.y = std::min((unsigned int) ncols, MAX_BLOCKS);
+                // No weights in this overload: pass a typed null pointer so WeightT can be deduced
+                sum_rows_by_key_large_nkeys_kernel_colmajor<<<grid, block, 0, st>>>(d_A, lda, d_keys,
+                        static_cast<const DataType *>(nullptr), nrows, ncols, key_offset, nkeys, d_sums);
+                RAFT_CUDA_TRY(cudaPeekAtLastError());
+            }
+
+#define RRBK_SHMEM_SZ 32
+
+//#define RRBK_SHMEM
+            // Row-major variant: block (x, y, z) handles a tile of columns (x), one key of the current
+            // window (y) and one contiguous partition of the rows (z). Each thread owns one column and
+            // atomically adds its (optionally weighted) partial sum to d_sums[key * ncols + col].
+            template<typename DataIteratorT, typename KeysIteratorT, typename WeightT>
+            __global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A,
+                                                                        int lda,
+                                                                        const WeightT *d_weights,
+                                                                        KeysIteratorT d_keys,
+                                                                        int nrows,
+                                                                        int ncols,
+                                                                        int key_offset,
+                                                                        int nkeys,
+                                                                        DataIteratorT d_sums) {
+                typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
+                typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+#ifdef RRBK_SHMEM
+                __shared__ KeyType sh_keys[RRBK_SHMEM_SZ];
+#endif
+                int rows_per_partition = nrows / gridDim.z + 1;
+                int start_row = blockIdx.z * rows_per_partition;
+                int end_row = start_row + rows_per_partition;
+                end_row = end_row > nrows ? nrows : end_row;
+
+                KeyType local_key = blockIdx.y;
+                if (local_key >= nkeys) return;
+                int this_col = threadIdx.x + blockIdx.x * blockDim.x;
+                if (this_col >= ncols) return;
+                DataType sum = DataType(0);  // typed zero: avoids double promotion for float data
+                KeyType global_key = key_offset + local_key;
+#ifdef RRBK_SHMEM
+                int sh_key_inx = 0;
+#endif
+                for (int r = start_row; r < end_row; r++) {
+#ifdef RRBK_SHMEM
+                    if (0 == sh_key_inx % RRBK_SHMEM_SZ) {
+          for (int x = threadIdx.x; x < RRBK_SHMEM_SZ; x += blockDim.x)
+            sh_keys[x] = d_keys[r + x];
+          __syncthreads();
+        }
+        if (sh_keys[sh_key_inx] != global_key) continue;  // No divergence since global_key is the
+        // same for the whole block
+        sh_key_inx++;
+#else
+                    // No divergence: r and global_key are the same for the whole block
+                    if (d_keys[r] != global_key) continue;
+#endif
+                    // 64-bit offset: r * lda can exceed INT_MAX for large inputs
+                    DataType val = __ldcg(&d_A[static_cast<size_t>(r) * lda + this_col]);
+                    if (d_weights) { val = val * d_weights[r]; }
+                    sum += val;
+                }
+
+                if (sum != DataType(0)) raft::myAtomicAdd(&d_sums[global_key * ncols + this_col], sum);
+            }
+
+            // Host launcher for the row-major large-nkeys kernel (one grid.y slice per key).
+            template<typename DataIteratorT, typename KeysIteratorT, typename WeightT>
+            void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT d_A,
+                                                      int lda,
+                                                      const KeysIteratorT d_keys,
+                                                      const WeightT *d_weights,
+                                                      int nrows, int ncols,
+                                                      int key_offset, int nkeys,
+                                                      DataIteratorT d_sums,
+                                                      cudaStream_t st) {
+                if (nrows <= 0 || ncols <= 0 || nkeys <= 0) return;  // avoids /0 below and a zero-sized grid
+                // x-dim refers to the column in the input data
+                // y-dim refers to the key
+                // z-dim refers to a partitioning of the rows among the threadblocks
+                dim3 grid, block;
+                block.x = 256;  // Adjust me!
+                block.y = 1;    // Don't adjust me!
+                grid.x = raft::ceildiv(ncols, (int) block.x);
+                grid.y = nkeys;
+                grid.z = std::max(40960000 / nkeys / ncols, (int) 1);  // Adjust me!
+                grid.z = std::min(grid.z, (unsigned int) nrows);
+                grid.z = std::min(grid.z, MAX_BLOCKS);
+                sum_rows_by_key_large_nkeys_kernel_rowmajor<<<grid, block, 0, st>>>(
+                        d_A, lda, d_weights, d_keys, nrows, ncols, key_offset, nkeys, d_sums);
+                RAFT_CUDA_TRY(cudaPeekAtLastError());  // surface launch-configuration errors here
+            }
+
+/**
+ * @brief Computes the weighted reduction of matrix rows for each given key
+ *
+ * @tparam DataIteratorT Random-access iterator type, for reading input matrix
+ *                       (may be a simple pointer type)
+ * @tparam KeysIteratorT Random-access iterator type, for reading input keys
+ *                       (may be a simple pointer type)
+ * @tparam WeightT       Type of the per-row weights
+ *
+ * @param[in]  d_A         Input data array, row-major (nrows x lda)
+ * @param[in]  lda         Real row size for input data, d_A
+ * @param[in]  d_keys      Keys for each row (1 x nrows), each in [0, nkeys)
+ * @param[in]  d_weights   Weights for each row of d_A (1 x nrows); may be nullptr (unweighted)
+ * @param[out] d_keys_char Scratch memory (nrows chars), used only when nkeys <= 4
+ * @param[in]  nrows       Number of rows in d_A and d_keys
+ * @param[in]  ncols       Number of data columns in d_A
+ * @param[in]  nkeys       Number of unique keys in d_keys
+ * @param[out] d_sums      Row sums by key, row-major (nkeys x ncols)
+ * @param[in]  stream      CUDA stream
+ */
+            template<typename DataIteratorT, typename KeysIteratorT, typename WeightT>
+            void reduce_rows_by_key(const DataIteratorT d_A,
+                                    int lda,
+                                    const KeysIteratorT d_keys,
+                                    const WeightT *d_weights,
+                                    char *d_keys_char,
+                                    int nrows,
+                                    int ncols,
+                                    int nkeys,
+                                    DataIteratorT d_sums,
+                                    cudaStream_t stream) {
+                typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+                // The kernels accumulate with atomics, so the output must start zeroed. The byte
+                // count is evaluated in size_t so that ncols * nkeys cannot overflow int.
+                RAFT_CUDA_TRY(cudaMemsetAsync(d_sums, 0, sizeof(DataType) * ncols * nkeys, stream));
+                if (nkeys <= SUM_ROWS_BY_KEY_SMALL_K_MAX_K) {
+                    // sum_rows_by_key_small_k is BW bounded. d_keys is loaded ncols time - avoiding wasting BW
+                    // with doubles we have ~20% speed up - with floats we can hope something around 2x
+                    // Converting d_keys to char
+                    convert_array(d_keys_char, d_keys, nrows, stream);
+                    sum_rows_by_key_small_nkeys(
+                            d_A, lda, d_keys_char, d_weights, nrows, ncols, nkeys, d_sums, stream);
+                } else {
+                    // Process the keys in windows of at most SUM_ROWS_BY_KEY_LARGE_K_MAX_K. The offset
+                    // is an int (not KeyType) so that narrow key types cannot wrap and loop forever.
+                    for (int key_offset = 0; key_offset < nkeys; key_offset += SUM_ROWS_BY_KEY_LARGE_K_MAX_K) {
+                        int this_call_nkeys = std::min(SUM_ROWS_BY_KEY_LARGE_K_MAX_K, nkeys - key_offset);
+                        sum_rows_by_key_large_nkeys_rowmajor(
+                                d_A, lda, d_keys, d_weights, nrows, ncols, key_offset, this_call_nkeys, d_sums, stream);
+                    }
+                }
+            }
+
+/**
+ * @brief Computes the unweighted reduction (sum) of matrix rows for each given key
+ *
+ * Forwards to the weighted overload with a null weights pointer.
+ *
+ * @tparam DataIteratorT Random-access iterator type, for reading input matrix (may be a simple
+ * pointer type)
+ * @tparam KeysIteratorT Random-access iterator type, for reading input keys (may be a simple
+ * pointer type)
+ * @param[in]  d_A         Input data array, row-major (nrows x lda)
+ * @param[in]  lda         Real row size for input data, d_A
+ * @param[in]  d_keys      Keys for each row (1 x nrows)
+ * @param      d_keys_char Scratch memory for conversion of keys to char
+ * @param[in]  nrows       Number of rows in d_A and d_keys
+ * @param[in]  ncols       Number of data columns in d_A
+ * @param[in]  nkeys       Number of unique keys in d_keys
+ * @param[out] d_sums      Row sums by key (nkeys x ncols)
+ * @param[in]  stream      CUDA stream
+ */
+            template<typename DataIteratorT, typename KeysIteratorT>
+            void reduce_rows_by_key(const DataIteratorT d_A,
+                                    int lda,
+                                    const KeysIteratorT d_keys,
+                                    char *d_keys_char,
+                                    int nrows,
+                                    int ncols,
+                                    int nkeys,
+                                    DataIteratorT d_sums,
+                                    cudaStream_t stream) {
+                using value_t = typename std::iterator_traits<DataIteratorT>::value_type;
+                // A typed null pointer selects the weighted overload with weighting disabled
+                value_t *const no_weights = nullptr;
+                reduce_rows_by_key(d_A, lda, d_keys,
+                                   no_weights,
+                                   d_keys_char,
+                                   nrows, ncols, nkeys,
+                                   d_sums, stream);
+            }
+
+        };  // end namespace detail
+    };  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/rsvd.cuh b/cpp/include/raft/linalg/detail/rsvd.cuh
new file mode 100644
index 0000000000..700ce43735
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/rsvd.cuh
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/eig.hpp>
+#include <raft/linalg/gemm.hpp>
+#include <raft/linalg/qr.hpp>
+#include <raft/linalg/svd.hpp>
+#include <raft/linalg/transpose.hpp>
+#include <raft/matrix/math.hpp>
+#include <raft/matrix/matrix.hpp>
+#include <raft/random/rng.hpp>
+
+namespace raft {
+    namespace linalg {
+        namespace detail {
+
+/**
+ * @brief randomized singular value decomposition (RSVD) on the column major
+ * float type input matrix (Jacobi-based), by specifying no. of PCs and
+ * upsamples directly
+ * @param handle: raft handle
+ * @param M: input matrix
+ * @param n_rows: number rows of input matrix
+ * @param n_cols: number columns of input matrix
+ * @param S_vec: singular values of input matrix
+ * @param U: left singular vectors of input matrix
+ * @param V: right singular vectors of input matrix
+ * @param k: no. of singular values to be computed
+ * @param p: no. of upsamples
+ * @param use_bbt: whether use eigen decomposition in computation or not
+ * @param gen_left_vec: left vector needs to be generated or not?
+ * @param gen_right_vec: right vector needs to be generated or not?
+ * @param use_jacobi: whether to use the Jacobi solver for the decomposition
+ * @param tol: tolerance for Jacobi-based solvers
+ * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers
+ * @param stream cuda stream
+ */
+            template<typename math_t>
+            void rsvdFixedRank(const raft::handle_t &handle,
+                               math_t *M,
+                               int n_rows,
+                               int n_cols,
+                               math_t *S_vec,
+                               math_t *U,
+                               math_t *V,
+                               int k,
+                               int p,
+                               bool use_bbt,
+                               bool gen_left_vec,
+                               bool gen_right_vec,
+                               bool use_jacobi,
+                               math_t tol,
+                               int max_sweeps,
+                               cudaStream_t stream) {
+                cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+                cublasHandle_t cublasH = handle.get_cublas_handle();
+
+                // All the notations are following Algorithm 4 & 5 in S. Voronin's paper:
+                // https://arxiv.org/abs/1502.05366
+
+                int m = n_rows, n = n_cols;
+                int l = k + p;  // Total number of singular values to be computed before truncation
+                int q = 2;      // Number of power sampling counts
+                int s = 1;      // Frequency controller for QR decomposition during power sampling
+                // scheme. s = 1: 2 QR per iteration; s = 2: 1 QR per iteration; s
+                // > 2: less frequent QR
+
+                const math_t alpha = 1.0, beta = 0.0;
+
+                // Build temporary U, S, V matrices
+                rmm::device_uvector <math_t> S_vec_tmp(l, stream);
+                RAFT_CUDA_TRY(cudaMemsetAsync(S_vec_tmp.data(), 0, sizeof(math_t) * l, stream));
+
+                // build random matrix
+                rmm::device_uvector <math_t> RN(n * l, stream);
+                raft::random::Rng rng(484);
+                rng.normal(RN.data(), n * l, math_t(0.0), alpha, stream);
+
+                // multiply to get matrix of random samples Y
+                rmm::device_uvector <math_t> Y(m * l, stream);
+                raft::linalg::gemm(
+                        handle, M, m, n, RN.data(), Y.data(), m, l, CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream);
+
+                // now build up (M M^T)^q R
+                rmm::device_uvector <math_t> Z(n * l, stream);
+                rmm::device_uvector <math_t> Yorth(m * l, stream);
+                rmm::device_uvector <math_t> Zorth(n * l, stream);
+                RAFT_CUDA_TRY(cudaMemsetAsync(Z.data(), 0, sizeof(math_t) * n * l, stream));
+                RAFT_CUDA_TRY(cudaMemsetAsync(Yorth.data(), 0, sizeof(math_t) * m * l, stream));
+                RAFT_CUDA_TRY(cudaMemsetAsync(Zorth.data(), 0, sizeof(math_t) * n * l, stream));
+
+                // power sampling scheme
+                for (int j = 1; j < q; j++) {
+                    if ((2 * j - 2) % s == 0) {
+                        raft::linalg::qrGetQ(handle, Y.data(), Yorth.data(), m, l, stream);
+                        raft::linalg::gemm(handle,
+                                           M,
+                                           m,
+                                           n,
+                                           Yorth.data(),
+                                           Z.data(),
+                                           n,
+                                           l,
+                                           CUBLAS_OP_T,
+                                           CUBLAS_OP_N,
+                                           alpha,
+                                           beta,
+                                           stream);
+                    } else {
+                        raft::linalg::gemm(
+                                handle, M, m, n, Y.data(), Z.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta,
+                                stream);
+                    }
+
+                    if ((2 * j - 1) % s == 0) {
+                        raft::linalg::qrGetQ(handle, Z.data(), Zorth.data(), n, l, stream);
+                        raft::linalg::gemm(handle,
+                                           M,
+                                           m,
+                                           n,
+                                           Zorth.data(),
+                                           Y.data(),
+                                           m,
+                                           l,
+                                           CUBLAS_OP_N,
+                                           CUBLAS_OP_N,
+                                           alpha,
+                                           beta,
+                                           stream);
+                    } else {
+                        raft::linalg::gemm(
+                                handle, M, m, n, Z.data(), Y.data(), m, l, CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta,
+                                stream);
+                    }
+                }
+
+                // orthogonalize on exit from loop to get Q
+                rmm::device_uvector <math_t> Q(m * l, stream);
+                RAFT_CUDA_TRY(cudaMemsetAsync(Q.data(), 0, sizeof(math_t) * m * l, stream));
+                raft::linalg::qrGetQ(handle, Y.data(), Q.data(), m, l, stream);
+
+                // either QR of B^T method, or eigendecompose BB^T method
+                if (!use_bbt) {
+                    // form Bt = Mt*Q : nxm * mxl = nxl
+                    rmm::device_uvector <math_t> Bt(n * l, stream);
+                    RAFT_CUDA_TRY(cudaMemsetAsync(Bt.data(), 0, sizeof(math_t) * n * l, stream));
+                    raft::linalg::gemm(
+                            handle, M, m, n, Q.data(), Bt.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream);
+
+                    // compute QR factorization of Bt
+                    // M is mxn ; Q is mxn ; R is min(m,n) x min(m,n) */
+                    rmm::device_uvector <math_t> Qhat(n * l, stream);
+                    RAFT_CUDA_TRY(cudaMemsetAsync(Qhat.data(), 0, sizeof(math_t) * n * l, stream));
+                    rmm::device_uvector <math_t> Rhat(l * l, stream);
+                    RAFT_CUDA_TRY(cudaMemsetAsync(Rhat.data(), 0, sizeof(math_t) * l * l, stream));
+                    raft::linalg::qrGetQR(handle, Bt.data(), Qhat.data(), Rhat.data(), n, l, stream);
+
+                    // compute SVD of Rhat (lxl)
+                    rmm::device_uvector <math_t> Uhat(l * l, stream);
+                    RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream));
+                    rmm::device_uvector <math_t> Vhat(l * l, stream);
+                    RAFT_CUDA_TRY(cudaMemsetAsync(Vhat.data(), 0, sizeof(math_t) * l * l, stream));
+                    if (use_jacobi)
+                        raft::linalg::svdJacobi(handle,
+                                                Rhat.data(),
+                                                l,
+                                                l,
+                                                S_vec_tmp.data(),
+                                                Uhat.data(),
+                                                Vhat.data(),
+                                                true,
+                                                true,
+                                                tol,
+                                                max_sweeps,
+                                                stream);
+                    else
+                        raft::linalg::svdQR(handle,
+                                            Rhat.data(),
+                                            l,
+                                            l,
+                                            S_vec_tmp.data(),
+                                            Uhat.data(),
+                                            Vhat.data(),
+                                            true,
+                                            true,
+                                            true,
+                                            stream);
+                    raft::matrix::sliceMatrix(S_vec_tmp.data(),
+                                              1,
+                                              l,
+                                              S_vec,
+                                              0,
+                                              0,
+                                              1,
+                                              k,
+                                              stream);  // First k elements of S_vec
+
+                    // Merge step 14 & 15 by calculating U = Q*Vhat[:,1:k] mxl * lxk = mxk
+                    if (gen_left_vec) {
+                        raft::linalg::gemm(handle,
+                                           Q.data(),
+                                           m,
+                                           l,
+                                           Vhat.data(),
+                                           U,
+                                           m,
+                                           k /*used to be l and needs slicing*/,
+                                           CUBLAS_OP_N,
+                                           CUBLAS_OP_N,
+                                           alpha,
+                                           beta,
+                                           stream);
+                    }
+
+                    // Merge step 14 & 15 by calculating V = Qhat*Uhat[:,1:k] nxl * lxk = nxk
+                    if (gen_right_vec) {
+                        raft::linalg::gemm(handle,
+                                           Qhat.data(),
+                                           n,
+                                           l,
+                                           Uhat.data(),
+                                           V,
+                                           n,
+                                           k /*used to be l and needs slicing*/,
+                                           CUBLAS_OP_N,
+                                           CUBLAS_OP_N,
+                                           alpha,
+                                           beta,
+                                           stream);
+                    }
+                } else {
+                    // build the matrix B B^T = Q^T M M^T Q column by column
+                    // Bt = M^T Q ; nxm * mxk = nxk
+                    rmm::device_uvector <math_t> B(n * l, stream);
+                    raft::linalg::gemm(
+                            handle, Q.data(), m, l, M, B.data(), l, n, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream);
+
+                    rmm::device_uvector <math_t> BBt(l * l, stream);
+                    raft::linalg::gemm(handle,
+                                       B.data(),
+                                       l,
+                                       n,
+                                       B.data(),
+                                       BBt.data(),
+                                       l,
+                                       l,
+                                       CUBLAS_OP_N,
+                                       CUBLAS_OP_T,
+                                       alpha,
+                                       beta,
+                                       stream);
+
+                    // compute eigendecomposition of BBt
+                    rmm::device_uvector <math_t> Uhat(l * l, stream);
+                    RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream));
+                    rmm::device_uvector <math_t> Uhat_dup(l * l, stream);
+                    RAFT_CUDA_TRY(cudaMemsetAsync(Uhat_dup.data(), 0, sizeof(math_t) * l * l, stream));
+                    raft::matrix::copyUpperTriangular(BBt.data(), Uhat_dup.data(), l, l, stream);
+                    if (use_jacobi)
+                        raft::linalg::eigJacobi(
+                                handle, Uhat_dup.data(), l, l, Uhat.data(), S_vec_tmp.data(), stream, tol, max_sweeps);
+                    else
+                        raft::linalg::eigDC(handle, Uhat_dup.data(), l, l, Uhat.data(), S_vec_tmp.data(), stream);
+                    raft::matrix::seqRoot(S_vec_tmp.data(), l, stream);
+                    raft::matrix::sliceMatrix(S_vec_tmp.data(),
+                                              1,
+                                              l,
+                                              S_vec,
+                                              0,
+                                              p,
+                                              1,
+                                              l,
+                                              stream);  // Last k elements of S_vec
+                    raft::matrix::colReverse(S_vec, 1, k, stream);
+
+                    // Merge step 14 & 15 by calculating U = Q*Uhat[:,(p+1):l] mxl * lxk = mxk
+                    if (gen_left_vec) {
+                        raft::linalg::gemm(handle,
+                                           Q.data(),
+                                           m,
+                                           l,
+                                           Uhat.data() + p * l,
+                                           U,
+                                           m,
+                                           k,
+                                           CUBLAS_OP_N,
+                                           CUBLAS_OP_N,
+                                           alpha,
+                                           beta,
+                                           stream);
+                        raft::matrix::colReverse(U, m, k, stream);
+                    }
+
+                    // Merge step 14 & 15 by calculating V = B^T Uhat[:,(p+1):l] *
+                    // Sigma^{-1}[(p+1):l, (p+1):l] nxl * lxk * kxk = nxk
+                    if (gen_right_vec) {
+                        rmm::device_uvector <math_t> Sinv(k * k, stream);
+                        RAFT_CUDA_TRY(cudaMemsetAsync(Sinv.data(), 0, sizeof(math_t) * k * k, stream));
+                        rmm::device_uvector <math_t> UhatSinv(l * k, stream);
+                        RAFT_CUDA_TRY(cudaMemsetAsync(UhatSinv.data(), 0, sizeof(math_t) * l * k, stream));
+                        raft::matrix::reciprocal(S_vec_tmp.data(), l, stream);
+                        raft::matrix::initializeDiagonalMatrix(S_vec_tmp.data() + p, Sinv.data(), k, k, stream);
+
+                        raft::linalg::gemm(handle,
+                                           Uhat.data() + p * l,
+                                           l,
+                                           k,
+                                           Sinv.data(),
+                                           UhatSinv.data(),
+                                           l,
+                                           k,
+                                           CUBLAS_OP_N,
+                                           CUBLAS_OP_N,
+                                           alpha,
+                                           beta,
+                                           stream);
+                        raft::linalg::gemm(handle,
+                                           B.data(),
+                                           l,
+                                           n,
+                                           UhatSinv.data(),
+                                           V,
+                                           n,
+                                           k,
+                                           CUBLAS_OP_T,
+                                           CUBLAS_OP_N,
+                                           alpha,
+                                           beta,
+                                           stream);
+                        raft::matrix::colReverse(V, n, k, stream);
+                    }
+                }
+            }
+
+/**
+ * @brief Randomized SVD (RSVD) of a column-major input matrix where the number of
+ * singular values and the number of upsamples are given as ratios of
+ * min(n_rows, n_cols). Both counts are clamped to at least 1 and the work is
+ * delegated to rsvdFixedRank.
+ * @param handle: raft handle
+ * @param M: input matrix
+ * @param n_rows: number of rows of the input matrix
+ * @param n_cols: number of columns of the input matrix
+ * @param S_vec: output singular values
+ * @param U: output left singular vectors
+ * @param V: output right singular vectors
+ * @param PC_perc: ratio of min(n_rows, n_cols) giving the number of singular values
+ * @param UpS_perc: ratio of min(n_rows, n_cols) giving the number of upsamples
+ * @param use_bbt: whether use eigen decomposition in computation or not
+ * @param gen_left_vec: whether the left singular vectors are generated
+ * @param gen_right_vec: whether the right singular vectors are generated
+ * @param use_jacobi: whether to use the Jacobi solver for the decomposition
+ * @param tol: tolerance for Jacobi-based solvers
+ * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers
+ * @param stream cuda stream
+ */
+            template<typename math_t>
+            void rsvdPerc(const raft::handle_t &handle,
+                          math_t *M,
+                          int n_rows,
+                          int n_cols,
+                          math_t *S_vec,
+                          math_t *U,
+                          math_t *V,
+                          math_t PC_perc,
+                          math_t UpS_perc,
+                          bool use_bbt,
+                          bool gen_left_vec,
+                          bool gen_right_vec,
+                          bool use_jacobi,
+                          math_t tol,
+                          int max_sweeps,
+                          cudaStream_t stream) {
+                // Both ratios are taken relative to the smaller matrix dimension.
+                const int min_dim = min(n_rows, n_cols);
+
+                // Number of singular values to compute (never fewer than one).
+                const int n_components = max((int) (min_dim * PC_perc), 1);
+
+                // Number of upsamples (never fewer than one).
+                const int n_upsamples = max((int) (min_dim * UpS_perc), 1);
+
+                // Hand the resolved counts over to the fixed-rank implementation.
+                rsvdFixedRank(handle,
+                              M, n_rows, n_cols,
+                              S_vec, U, V,
+                              n_components, n_upsamples,
+                              use_bbt,
+                              gen_left_vec, gen_right_vec,
+                              use_jacobi, tol, max_sweeps,
+                              stream);
+            }
+
+        };  // end namespace detail
+    };  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/ternary_op.cuh b/cpp/include/raft/linalg/detail/ternary_op.cuh
new file mode 100644
index 0000000000..935ffed190
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/ternary_op.cuh
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <raft/vectorized.cuh>
+
+namespace raft {
+    namespace linalg {
+namespace detail {
+    /** out = op(in1, in2, in3) element-wise; each thread with offset < len handles one full TxN_t chunk. */
+    template<typename math_t, int veclen_, typename Lambda, typename IdxType>
+    __global__ void ternaryOpKernel(
+            math_t *out, const math_t *in1, const math_t *in2, const math_t *in3, IdxType len, Lambda op) {
+        using Chunk = raft::TxN_t<math_t, veclen_>;
+        IdxType offset = threadIdx.x + ((IdxType) blockIdx.x * blockDim.x);
+        offset *= Chunk::Ratio;  // index of the first element owned by this thread
+        if (offset < len) {
+            Chunk x, y, z;
+            x.load(in1, offset);
+            y.load(in2, offset);
+            z.load(in3, offset);
+#pragma unroll
+            for (int e = 0; e < Chunk::Ratio; ++e) { x.val.data[e] = op(x.val.data[e], y.val.data[e], z.val.data[e]); }
+            x.store(out, offset);
+        }
+    }
+
+    /**
+     * Launches ternaryOpKernel on `stream` with TPB threads per block. An empty input
+     * (len == 0) returns immediately: it would otherwise request a grid of zero blocks,
+     * which CUDA rejects as an invalid launch configuration.
+     */
+    template<typename math_t, int veclen_, typename Lambda, typename IdxType, int TPB>
+    void ternaryOpImpl(math_t *out, const math_t *in1, const math_t *in2, const math_t *in3,
+                       IdxType len, Lambda op, cudaStream_t stream) {
+        if (len == 0) return;  // nothing to process; skip the launch entirely
+        const IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : len, (IdxType) TPB);
+        ternaryOpKernel<math_t, veclen_, Lambda, IdxType><<<nblks, TPB, 0, stream>>>(out, in1, in2, in3, len, op);
+        RAFT_CUDA_TRY(cudaPeekAtLastError());
+    }
+
+/**
+ * @brief Element-wise ternary operation over three equally sized input arrays
+ * @tparam math_t element type of the inputs and of the output
+ * @tparam Lambda type of the device lambda applied to each triple of elements
+ * @tparam IdxType integer type used for addressing
+ * @tparam TPB threads-per-block of the launched kernel
+ * @param out the output array
+ * @param in1 the first input array
+ * @param in2 the second input array
+ * @param in3 the third input array
+ * @param len number of elements in each array
+ * @param op the device lambda, called as op(in1[i], in2[i], in3[i])
+ * @param stream cuda stream where the work is launched
+ * @note NOTE(review): only the byte length is checked for vectorization, not the pointer
+ *       alignment - confirm callers always pass suitably aligned buffers.
+ */
+    template<typename math_t, typename Lambda, typename IdxType = int, int TPB = 256>
+    void ternaryOp(math_t *out, const math_t *in1, const math_t *in2, const math_t *in3,
+                   IdxType len, Lambda op, cudaStream_t stream) {
+        // Pick the widest chunk (16, 8, 4, 2 or 1 bytes) that holds at least one element and
+        // evenly divides the payload; the first match wins.
+        constexpr size_t elem_bytes = sizeof(math_t);
+        const size_t total_bytes = len * elem_bytes;
+
+        if (16 / elem_bytes && total_bytes % 16 == 0)
+            return ternaryOpImpl<math_t, 16 / elem_bytes, Lambda, IdxType, TPB>(
+                    out, in1, in2, in3, len, op, stream);
+        if (8 / elem_bytes && total_bytes % 8 == 0)
+            return ternaryOpImpl<math_t, 8 / elem_bytes, Lambda, IdxType, TPB>(
+                    out, in1, in2, in3, len, op, stream);
+        if (4 / elem_bytes && total_bytes % 4 == 0)
+            return ternaryOpImpl<math_t, 4 / elem_bytes, Lambda, IdxType, TPB>(
+                    out, in1, in2, in3, len, op, stream);
+        if (2 / elem_bytes && total_bytes % 2 == 0)
+            return ternaryOpImpl<math_t, 2 / elem_bytes, Lambda, IdxType, TPB>(
+                    out, in1, in2, in3, len, op, stream);
+        if (1 / elem_bytes)
+            return ternaryOpImpl<math_t, 1 / elem_bytes, Lambda, IdxType, TPB>(
+                    out, in1, in2, in3, len, op, stream);
+        // Fallback when no chunk size matched: one element per thread.
+        ternaryOpImpl<math_t, 1, Lambda, IdxType, TPB>(out, in1, in2, in3, len, op, stream);
+    }
+
+};  // end namespace detail
+    };  // end namespace linalg
+};  // end namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/lstsq.hpp b/cpp/include/raft/linalg/lstsq.hpp
new file mode 100644
index 0000000000..bfa302eb4f
--- /dev/null
+++ b/cpp/include/raft/linalg/lstsq.hpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/handle.hpp>
+#include <raft/linalg/detail/lstsq.cuh>
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief Ordinary least squares `Aw = b` via the SVD `A = U S Vt` (default cuSOLVER routine).
+ * @param handle raft handle
+ * @param A      input feature matrix; marked [in/out] in the cuSOLVER routines used, so it
+ *               is not guaranteed to stay unmodified
+ * @param n_rows number of rows of A
+ * @param n_cols number of columns of A
+ * @param b      right-hand side vector of `Aw = b`
+ * @param w      output: solution vector of `Aw = b`
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void lstsqSvdQR(const raft::handle_t& handle, math_t* A, const int n_rows, const int n_cols,
+                const math_t* b, math_t* w, cudaStream_t stream)
+{
+    raft::linalg::detail::lstsqSvdQR(handle, A, n_rows, n_cols, b, w, stream);
+}
+
+/**
+ * @brief Ordinary least squares `Aw = b` via the SVD `A = U S V^T`, Jacobi iterations (cuSOLVER).
+ * @param handle raft handle
+ * @param A      input feature matrix; marked [in/out] in the cuSOLVER routines used, so it
+ *               is not guaranteed to stay unmodified
+ * @param n_rows number of rows of A
+ * @param n_cols number of columns of A
+ * @param b      right-hand side vector of `Aw = b`
+ * @param w      output: solution vector of `Aw = b`
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void lstsqSvdJacobi(const raft::handle_t& handle, math_t* A, const int n_rows, const int n_cols,
+                    const math_t* b, math_t* w, cudaStream_t stream)
+{
+    raft::linalg::detail::lstsqSvdJacobi(handle, A, n_rows, n_cols, b, w, stream);
+}
+
+/**
+ * @brief Ordinary least squares `Aw = b` via eigendecomposition of `A^T * A`: `w = (A^T A)^-1  A^T b`.
+ * @param handle raft handle
+ * @param A      input feature matrix (taken as a pointer to const)
+ * @param n_rows,n_cols number of rows and columns of A
+ * @param b      right-hand side vector of `Aw = b`
+ * @param w      output: solution vector of `Aw = b`
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void lstsqEig(const raft::handle_t& handle, const math_t* A, const int n_rows, const int n_cols,
+              const math_t* b, math_t* w, cudaStream_t stream)
+{
+    raft::linalg::detail::lstsqEig(handle, A, n_rows, n_cols, b, w, stream);
+}
+
+/**
+ * @brief Ordinary least squares `Aw = b` via the QR decomposition `A = QR`,
+ *        i.e. by solving the triangular system `Rw = Q^T b`.
+ *
+ * @param handle raft handle
+ * @param[in,out] A input feature matrix.
+ *            Warning: the content of this matrix is modified by the cuSOLVER routines.
+ * @param n_rows number of rows of A
+ * @param n_cols number of columns of A
+ * @param[in,out] b input target vector.
+ *            Warning: the content of this vector is modified by the cuSOLVER routines.
+ * @param w      output: solution vector of `Aw = b`
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void lstsqQR(const raft::handle_t& handle, math_t* A, const int n_rows, const int n_cols,
+             math_t* b, math_t* w, cudaStream_t stream)
+{
+    raft::linalg::detail::lstsqQR(handle, A, n_rows, n_cols, b, w, stream);
+}
+
+};  // namespace linalg
+};  // namespace raft
diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh
new file mode 100644
index 0000000000..1a39d4b3ba
--- /dev/null
+++ b/cpp/include/raft/linalg/power.cuh
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/unary_op.hpp>
+
+namespace raft {
+    namespace linalg {
+
+/**
+ * @defgroup ScalarOps Scalar operations on the input buffer
+ * Applies raft::myPow(element, scalar) to the input buffer through raft::linalg::unaryOp.
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType integer type used for addressing
+ * @param out the output buffer
+ * @param in the input buffer
+ * @param scalar the scalar passed as second argument to raft::myPow
+ * @param len number of elements in the input buffer
+ * @param stream cuda stream where to launch work
+ * @{
+ */
+template<typename math_t, typename IdxType = int>
+void powerScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, cudaStream_t stream) {
+    auto pow_by_scalar = [scalar] __device__(math_t value) { return raft::myPow(value, scalar); };
+    raft::linalg::unaryOp(out, in, len, pow_by_scalar, stream);
+}
+/** @} */
+
+/**
+ * @defgroup BinaryOps Element-wise binary operations on the input buffers
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType integer type used for addressing
+ * @param out the output buffer
+ * @param in1 the first input buffer (first argument of raft::myPow)
+ * @param in2 the second input buffer (second argument of raft::myPow)
+ * @param len number of elements in the input buffers
+ * @param stream cuda stream where to launch work
+ * @{
+ */
+template<typename math_t, typename IdxType = int>
+void power(math_t *out, const math_t *in1, const math_t *in2, IdxType len, cudaStream_t stream) {
+    auto pow_op = [] __device__(math_t lhs, math_t rhs) { return raft::myPow(lhs, rhs); };
+    raft::linalg::binaryOp(out, in1, in2, len, pow_op, stream);
+}
+/** @} */
+
+    };  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
new file mode 100644
index 0000000000..c6e163d491
--- /dev/null
+++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/linalg/detail/reduce_cols_by_key.cuh>
+
+namespace raft {
+namespace linalg {
+
+
+/**
+ * @brief Computes the sum-reduction of matrix columns for each given key
+ *
+ * @tparam T            data type of the input matrix and of the reduced output matrix
+ * @tparam KeyIteratorT type of the keys argument
+ * @tparam IdxType      indexing arithmetic type
+ *
+ * @param data   the input data (dim = nrows x ncols), assumed to be in row-major
+ *               layout
+ * @param keys   keys array (len = ncols). Every key is assumed to lie in
+ *               [0, nkeys). If that does not hold, the caller is expected to call
+ *               the make_monotonic primitive first to obtain a contiguous and
+ *               monotonically increasing keys array.
+ * @param out    the output reduced matrix along columns (dim = nrows x nkeys),
+ *               assumed to be in row-major layout
+ * @param nrows  number of rows in the input data
+ * @param ncols  number of columns in the input data
+ * @param nkeys  number of unique keys in the keys array
+ * @param stream cuda stream to launch the kernel onto
+ */
+template<typename T, typename KeyIteratorT, typename IdxType = int>
+void reduce_cols_by_key(const T *data,
+                        const KeyIteratorT keys,
+                        T *out,
+                        IdxType nrows, IdxType ncols, IdxType nkeys,
+                        cudaStream_t stream) {
+    raft::linalg::detail::reduce_cols_by_key(data, keys, out, nrows, ncols, nkeys, stream);
+}
+};  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
new file mode 100644
index 0000000000..3b5345a540
--- /dev/null
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/linalg/detail/reduce_rows_by_key.cuh>
+
+namespace raft {
+namespace linalg {
+/**
+ * @brief Weighted reduction of the rows of a matrix, grouped by the key of each row
+ *
+ * @tparam DataIteratorT Random-access iterator type used to read the input matrix
+ *                       (a plain pointer type also works)
+ * @tparam KeysIteratorT Random-access iterator type used to read the input keys
+ *                       (a plain pointer type also works)
+ * @tparam WeightT       Element type of the weights array
+ *
+ * @param[in]  d_A         Input data array (lda x nrows)
+ * @param[in]  lda         Real row size of the input data d_A
+ * @param[in]  d_keys      Key of each row (1 x nrows)
+ * @param[in]  d_weights   Weight of each observation in d_A (1 x nrows)
+ * @param[out] d_keys_char Scratch memory used to convert the keys to char
+ * @param[in]  nrows       Number of rows in d_A and d_keys
+ * @param[in]  ncols       Number of data columns in d_A
+ * @param[in]  nkeys       Number of unique keys in d_keys
+ * @param[out] d_sums      Row sums by key (ncols x nkeys)
+ * @param[in]  stream      CUDA stream
+ */
+template<typename DataIteratorT, typename KeysIteratorT, typename WeightT>
+void reduce_rows_by_key(const DataIteratorT d_A,
+                        int lda,
+                        const KeysIteratorT d_keys,
+                        const WeightT *d_weights,
+                        char *d_keys_char,
+                        int nrows, int ncols, int nkeys,
+                        DataIteratorT d_sums,
+                        cudaStream_t stream) {
+    raft::linalg::detail::reduce_rows_by_key(
+            d_A, lda, d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream);
+}
+
+/**
+ * @brief Computes the (unweighted) reduction of matrix rows for each given key
+ *
+ * Forwards to the weighted overload with a null weights pointer.
+ *
+ * @tparam DataIteratorT Random-access iterator type, for reading input matrix
+ *                       (may be a simple pointer type)
+ * @tparam KeysIteratorT Random-access iterator type, for reading input keys
+ *                       (may be a simple pointer type)
+ *
+ * @param[in]  d_A         Input data array (lda x nrows)
+ * @param[in]  lda         Real row size for input data, d_A
+ * @param[in]  d_keys      Keys for each row (1 x nrows)
+ * @param      d_keys_char Scratch memory for conversion of keys to char
+ * @param[in]  nrows       Number of rows in d_A and d_keys
+ * @param[in]  ncols       Number of data columns in d_A
+ * @param[in]  nkeys       Number of unique keys in d_keys
+ * @param[out] d_sums      Row sums by key (ncols x nkeys)
+ * @param[in]  stream      CUDA stream
+ */
+template<typename DataIteratorT, typename KeysIteratorT>
+void reduce_rows_by_key(const DataIteratorT d_A,
+                        int lda,
+                        const KeysIteratorT d_keys,
+                        char *d_keys_char,
+                        int nrows,
+                        int ncols,
+                        int nkeys,
+                        DataIteratorT d_sums,
+                        cudaStream_t stream) {
+    // No weights are supplied: a null pointer typed like the data elements is
+    // forwarded to the weighted overload.
+    typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+
+    reduce_rows_by_key(d_A, lda, d_keys,
+                       static_cast<DataType *>(nullptr),  // no weights
+                       d_keys_char, nrows, ncols, nkeys,
+                       d_sums, stream);
+}
+
+};  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/rsvd.cuh b/cpp/include/raft/linalg/rsvd.cuh
new file mode 100644
index 0000000000..e789abce30
--- /dev/null
+++ b/cpp/include/raft/linalg/rsvd.cuh
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/linalg/detail/rsvd.cuh>
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief Randomized singular value decomposition (RSVD) of a column-major input
+ * matrix, with the number of singular values and the number of upsamples given
+ * directly
+ * @param handle: raft handle
+ * @param M: input matrix
+ * @param n_rows: number of rows of the input matrix
+ * @param n_cols: number of columns of the input matrix
+ * @param S_vec: output singular values
+ * @param U: output left singular vectors
+ * @param V: output right singular vectors
+ * @param k: no. of singular values to be computed
+ * @param p: no. of upsamples
+ * @param use_bbt: whether use eigen decomposition in computation or not
+ * @param gen_left_vec: whether the left singular vectors are generated
+ * @param gen_right_vec: whether the right singular vectors are generated
+ * @param use_jacobi: whether to use the Jacobi solver for the decomposition
+ * @param tol: tolerance for Jacobi-based solvers
+ * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers
+ * @param stream cuda stream
+ */
+template<typename math_t>
+void rsvdFixedRank(const raft::handle_t &handle,
+                   math_t *M, int n_rows, int n_cols,
+                   math_t *S_vec, math_t *U, math_t *V,
+                   int k,
+                   int p,
+                   bool use_bbt, bool gen_left_vec, bool gen_right_vec,
+                   bool use_jacobi,
+                   math_t tol,
+                   int max_sweeps,
+                   cudaStream_t stream) {
+    // Pure forwarding wrapper: all work happens in the detail implementation.
+    raft::linalg::detail::rsvdFixedRank(handle,
+                                        M, n_rows, n_cols,
+                                        S_vec, U, V,
+                                        k, p,
+                                        use_bbt, gen_left_vec, gen_right_vec, use_jacobi,
+                                        tol, max_sweeps,
+                                        stream);
+}
+
+
+/**
+ * @brief Randomized singular value decomposition (RSVD) of a column-major input
+ * matrix, with the number of singular values (PC) and the number of upsamples
+ * given as ratios
+ *
+ * @param handle: raft handle
+ * @param M: input matrix
+ * @param n_rows: number of rows of the input matrix
+ * @param n_cols: number of columns of the input matrix
+ * @param S_vec: output singular values
+ * @param U: output left singular vectors
+ * @param V: output right singular vectors
+ * @param PC_perc: percentage of singular values to be computed
+ * @param UpS_perc: upsampling percentage
+ * @param use_bbt: whether use eigen decomposition in computation or not
+ * @param gen_left_vec: whether the left singular vectors are generated
+ * @param gen_right_vec: whether the right singular vectors are generated
+ * @param use_jacobi: whether to use the Jacobi solver for the decomposition
+ * @param tol: tolerance for Jacobi-based solvers
+ * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers
+ * @param stream cuda stream
+ */
+template<typename math_t>
+void rsvdPerc(const raft::handle_t &handle,
+              math_t *M, int n_rows, int n_cols,
+              math_t *S_vec, math_t *U, math_t *V,
+              math_t PC_perc, math_t UpS_perc,
+              bool use_bbt, bool gen_left_vec, bool gen_right_vec,
+              bool use_jacobi,
+              math_t tol, int max_sweeps,
+              cudaStream_t stream) {
+    // Pure forwarding wrapper: all work happens in the detail implementation.
+    raft::linalg::detail::rsvdPerc(handle,
+                                   M, n_rows, n_cols,
+                                   S_vec, U, V,
+                                   PC_perc, UpS_perc,
+                                   use_bbt, gen_left_vec, gen_right_vec, use_jacobi,
+                                   tol, max_sweeps,
+                                   stream);
+}
+
+};  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh
new file mode 100644
index 0000000000..49eb6788ef
--- /dev/null
+++ b/cpp/include/raft/linalg/sqrt.cuh
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/unary_op.hpp>
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @defgroup ScalarOps Scalar operations on the input buffer
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param out the output buffer; element i receives raft::mySqrt(in[i])
+ * @param in the input buffer
+ * @param len number of elements in the input buffer
+ * @param stream cuda stream where to launch work
+ * @{
+ */
+template <typename math_t, typename IdxType = int>
+void sqrt(math_t* out, const math_t* in, IdxType len, cudaStream_t stream)
+{
+    auto sqrt_op = [] __device__(math_t val) { return raft::mySqrt(val); };
+    raft::linalg::unaryOp(out, in, len, sqrt_op, stream);
+}
+/** @} */
+
+};  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh
new file mode 100644
index 0000000000..99e21fd5a0
--- /dev/null
+++ b/cpp/include/raft/linalg/ternary_op.cuh
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/linalg/detail/ternary_op.cuh>
+
+namespace raft {
+namespace linalg {
+/**
+ * @brief apply a three-input device lambda element-wise to the input arrays
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam Lambda the device-lambda performing the actual operation
+ * @tparam IdxType Integer type used for addressing
+ * @tparam TPB threads-per-block in the final kernel launched
+ * @param out the output array
+ * @param in1 the first input array
+ * @param in2 the second input array
+ * @param in3 the third input array
+ * @param len number of elements in the input array
+ * @param op the device-lambda
+ * @param stream cuda stream where to launch work
+ */
+template <typename math_t, typename Lambda, typename IdxType = int, int TPB = 256>
+void ternaryOp(math_t* out,
+               const math_t* in1, const math_t* in2, const math_t* in3,
+               IdxType len,
+               Lambda op,
+               cudaStream_t stream)
+{
+    // NOTE(review): IdxType/TPB are not spelled out in this call - confirm the detail overload honours them.
+    detail::ternaryOp(out, in1, in2, in3, len, op, stream);
+}
+
+};  // end namespace linalg
+};  // end namespace raft
\ No newline at end of file
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index fda60e1cb0..c0db20f650 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -56,10 +56,15 @@ add_executable(test_raft
     test/linalg/matrix_vector_op.cu
     test/linalg/multiply.cu
     test/linalg/norm.cu
+    test/linalg/power.cu
     test/linalg/reduce.cu
+    test/linalg/reduce_cols_by_key.cu
+    test/linalg/rsvd.cu
+    test/linalg/sqrt.cu
     test/linalg/strided_reduction.cu
     test/linalg/subtract.cu
     test/linalg/svd.cu
+    test/linalg/ternary_op.cu
     test/linalg/transpose.cu
     test/linalg/unary_op.cu
     test/matrix/math.cu
diff --git a/cpp/test/linalg/power.cu b/cpp/test/linalg/power.cu
new file mode 100644
index 0000000000..8f336d583f
--- /dev/null
+++ b/cpp/test/linalg/power.cu
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/linalg/power.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.hpp>
+
+namespace raft {
+namespace linalg {
+        template <typename Type>
+        __global__ void naivePowerElemKernel(Type* out, const Type* in1, const Type* in2, int len)
+        {  // naive reference: thread idx handles element idx, out = myPow(in1, in2)
+            int idx = threadIdx.x + blockIdx.x * blockDim.x;
+            if (idx < len) { out[idx] = raft::myPow(in1[idx], in2[idx]); }
+        }
+
+        template <typename Type>
+        void naivePowerElem(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream)
+        {
+            // Cover all `len` elements with 64-thread blocks, one element per thread.
+            constexpr int threads = 64;
+            naivePowerElemKernel<Type><<<raft::ceildiv(len, threads), threads, 0, stream>>>(out, in1, in2, len);
+            RAFT_CUDA_TRY(cudaPeekAtLastError());
+        }
+
+        template <typename Type>
+        __global__ void naivePowerScalarKernel(Type* out, const Type* in1, const Type in2, int len)
+        {
+            const int tid = blockIdx.x * blockDim.x + threadIdx.x;  // one element per thread
+            if (tid < len) { out[tid] = raft::myPow(in1[tid], in2); }  // guards the tail of the last block
+        }
+
+        template <typename Type>
+        void naivePowerScalar(Type* out, const Type* in1, const Type in2, int len, cudaStream_t stream)
+        {
+            // Cover all `len` elements with 64-thread blocks, one element per thread.
+            constexpr int threads = 64;
+            naivePowerScalarKernel<Type><<<raft::ceildiv(len, threads), threads, 0, stream>>>(out, in1, in2, len);
+            RAFT_CUDA_TRY(cudaPeekAtLastError());
+        }
+
+        template <typename T>
+        struct PowerInputs {
+            T tolerance;  // handed to raft::CompareApprox when results are checked against the reference
+            int len;  // number of elements in every buffer of the test
+            unsigned long long int seed;  // seed for raft::random::Rng
+        };
+
+        template <typename T>
+        ::std::ostream& operator<<(::std::ostream& os, const PowerInputs<T>& dims)
+        {
+            return os;  // writes nothing; presumably only here so gtest has a printer for the parameter type
+        }
+
+        template <typename T>
+        class PowerTest : public ::testing::TestWithParam<PowerInputs<T>> {
+        protected:
+            PowerTest() : in1(0, stream), in2(0, stream), out_ref(0, stream), out(0, stream) {}
+
+            // Builds random inputs between 1 and 2, the naive reference myPow(myPow(in1, in2), 2), and the
+            // results of the prims under test, both out-of-place (`out`) and in-place (overwriting `in1`).
+            void SetUp() override
+            {
+                params = ::testing::TestWithParam<PowerInputs<T>>::GetParam();
+                raft::random::Rng r(params.seed);
+                int len = params.len;
+                RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+                in1.resize(len, stream);
+                in2.resize(len, stream);
+                out_ref.resize(len, stream);
+                out.resize(len, stream);
+                r.uniform(in1.data(), len, T(1.0), T(2.0), stream);
+                r.uniform(in2.data(), len, T(1.0), T(2.0), stream);
+                naivePowerElem(out_ref.data(), in1.data(), in2.data(), len, stream);
+                naivePowerScalar(out_ref.data(), out_ref.data(), T(2), len, stream);
+                power(out.data(), in1.data(), in2.data(), len, stream);
+                powerScalar(out.data(), out.data(), T(2), len, stream);
+                power(in1.data(), in1.data(), in2.data(), len, stream);
+                powerScalar(in1.data(), in1.data(), T(2), len, stream);
+                // All results must be complete before the test body reads them.
+                RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+            }
+            // The stream is kept alive until the test body has finished with the buffers.
+            void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
+
+            cudaStream_t stream = 0;
+            PowerInputs<T> params;
+            rmm::device_uvector<T> in1, in2, out_ref, out;
+        };
+
+        // Single configuration per precision: {tolerance, len, seed}.
+        const std::vector<PowerInputs<float>> inputsf2  = {{0.000001f, 1024 * 1024, 1234ULL}};
+        const std::vector<PowerInputs<double>> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}};
+
+        using PowerTestF = PowerTest<float>;
+        TEST_P(PowerTestF, Result)
+        {
+            auto approx = raft::CompareApprox<float>(params.tolerance);
+            // out-of-place result
+            ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), params.len, approx));
+            // in-place result
+            ASSERT_TRUE(raft::devArrMatch(out_ref.data(), in1.data(), params.len, approx));
+        }
+
+        using PowerTestD = PowerTest<double>;
+        TEST_P(PowerTestD, Result)
+        {
+            auto approx = raft::CompareApprox<double>(params.tolerance);
+            // out-of-place result
+            ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), params.len, approx));
+            // in-place result
+            ASSERT_TRUE(raft::devArrMatch(out_ref.data(), in1.data(), params.len, approx));
+        }
+
+        INSTANTIATE_TEST_CASE_P(PowerTests, PowerTestF, ::testing::ValuesIn(inputsf2));
+
+        INSTANTIATE_TEST_CASE_P(PowerTests, PowerTestD, ::testing::ValuesIn(inputsd2));
+
+}  // end namespace linalg
+}  // end namespace raft
diff --git a/cpp/test/linalg/reduce_cols_by_key.cu b/cpp/test/linalg/reduce_cols_by_key.cu
new file mode 100644
index 0000000000..55057b4894
--- /dev/null
+++ b/cpp/test/linalg/reduce_cols_by_key.cu
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/linalg/reduce_cols_by_key.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/interruptible.hpp>
+#include <raft/random/rng.hpp>
+
+namespace raft {
+    namespace linalg {
+
+        // Host-side reference: copies the inputs to the host, adds every column j of
+        // each row into the output column named by keys[j], and copies the result back.
+        template <typename T>
+        void naiveReduceColsByKey(const T* in, const uint32_t* keys, T* out_ref,
+                                  uint32_t nrows, uint32_t ncols, uint32_t nkeys,
+                                  cudaStream_t stream)
+        {
+            std::vector<uint32_t> host_keys(ncols, 0u);
+            std::vector<T> host_in(nrows * ncols);
+            raft::copy(host_keys.data(), keys, ncols, stream);
+            raft::copy(host_in.data(), in, nrows * ncols, stream);
+            raft::interruptible::synchronize(stream);
+
+            std::vector<T> sums(nrows * nkeys, T(0));
+            for (uint32_t row = 0; row < nrows; ++row) {
+                const T* src = host_in.data() + row * ncols;
+                T* dst       = sums.data() + row * nkeys;
+                for (uint32_t col = 0; col < ncols; ++col) { dst[host_keys[col]] += src[col]; }
+            }
+
+            raft::copy(out_ref, sums.data(), nrows * nkeys, stream);
+            raft::interruptible::synchronize(stream);
+        }
+
+        template <typename T>
+        struct ReduceColsInputs {
+            T tolerance;  // handed to raft::CompareApprox when results are checked against the reference
+            uint32_t rows;  // number of rows of the input matrix
+            uint32_t cols;  // number of columns of the input matrix (one key per column)
+            uint32_t nkeys;  // number of output columns per row; also the upper bound given to uniformInt for keys
+            unsigned long long int seed;  // seed for raft::random::Rng
+        };
+
+        template <typename T>
+        ::std::ostream& operator<<(::std::ostream& os, const ReduceColsInputs<T>& dims)
+        {
+            return os;  // writes nothing; presumably only here so gtest has a printer for the parameter type
+        }
+
+        template <typename T>
+        class ReduceColsTest : public ::testing::TestWithParam<ReduceColsInputs<T>> {
+        protected:
+            ReduceColsTest() : in(0, stream), out_ref(0, stream), out(0, stream), keys(0, stream) {}
+
+            // Generates a random matrix and random keys, then produces both the host
+            // reference (out_ref) and the result of reduce_cols_by_key (out).
+            void SetUp() override
+            {
+                params = ::testing::TestWithParam<ReduceColsInputs<T>>::GetParam();
+                raft::random::Rng r(params.seed);
+                RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+                uint32_t nrows = params.rows, ncols = params.cols, nkeys = params.nkeys;
+                uint32_t in_len = nrows * ncols, out_len = nrows * nkeys;
+                in.resize(in_len, stream);
+                keys.resize(ncols, stream);
+                out_ref.resize(out_len, stream);
+                out.resize(out_len, stream);
+                r.uniform(in.data(), in_len, T(-1.0), T(1.0), stream);
+                r.uniformInt(keys.data(), ncols, 0u, params.nkeys, stream);
+                naiveReduceColsByKey(in.data(), keys.data(), out_ref.data(), nrows, ncols, nkeys, stream);
+                reduce_cols_by_key(in.data(), keys.data(), out.data(), nrows, ncols, nkeys, stream);
+                raft::interruptible::synchronize(stream);
+            }
+
+            void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
+
+            cudaStream_t stream = 0;
+            ReduceColsInputs<T> params;
+            rmm::device_uvector<T> in, out_ref, out;
+            rmm::device_uvector<uint32_t> keys;
+        };
+
+        // float cases: {tolerance, rows, cols, nkeys, seed}
+        const std::vector<ReduceColsInputs<float>> inputsf = {{0.0001f, 128, 32, 6, 1234ULL},
+                                                              {0.0005f, 121, 63, 10, 1234ULL}};
+        using ReduceColsTestF = ReduceColsTest<float>;
+        TEST_P(ReduceColsTestF, Result)
+        {
+            const auto n_out = params.rows * params.nkeys;
+            auto approx      = raft::CompareApprox<float>(params.tolerance);
+            ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), n_out, approx));
+        }
+        INSTANTIATE_TEST_CASE_P(ReduceColsTests, ReduceColsTestF, ::testing::ValuesIn(inputsf));
+
+        // double cases: {tolerance, rows, cols, nkeys, seed}
+        const std::vector<ReduceColsInputs<double>> inputsd2 = {{0.0000001, 128, 32, 6, 1234ULL},
+                                                                {0.0000001, 121, 63, 10, 1234ULL}};
+        using ReduceColsTestD = ReduceColsTest<double>;
+        TEST_P(ReduceColsTestD, Result)
+        {
+            const auto n_out = params.rows * params.nkeys;
+            auto approx      = raft::CompareApprox<double>(params.tolerance);
+            ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), n_out, approx));
+        }
+        INSTANTIATE_TEST_CASE_P(ReduceColsTests, ReduceColsTestD, ::testing::ValuesIn(inputsd2));
+
+}  // end namespace linalg
+}  // end namespace raft
diff --git a/cpp/test/linalg/reduce_rows_by_key.cu b/cpp/test/linalg/reduce_rows_by_key.cu
new file mode 100644
index 0000000000..e6dc8cef7f
--- /dev/null
+++ b/cpp/test/linalg/reduce_rows_by_key.cu
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "test_utils.h"
+#include <gtest/gtest.h>
+#include <iostream>
+#include <raft/linalg/reduce_rows_by_key.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.hpp>
+
+namespace raft {
+    namespace linalg {
+
+        // Naive reference kernel. The y index selects the key and the x index selects the
+        // column; each thread walks all rows and adds up the (optionally weighted) entries of
+        // its column whose row key matches. d_char_keys and nkeys are not used here.
+        template <typename Type>
+        __global__ void naiveReduceRowsByKeyKernel(const Type* d_A, int lda, uint32_t* d_keys,
+                                                   const Type* d_weight, char* d_char_keys,
+                                                   int nrows, int ncols, int nkeys,
+                                                   Type* d_sums)
+        {
+            const int col = threadIdx.x + blockIdx.x * blockDim.x;
+            if (col >= ncols) return;
+            const int key = threadIdx.y + blockIdx.y * blockDim.y;
+
+            Type acc = 0.0;
+            for (int row = 0; row < nrows; row++) {
+                if (d_keys[row] == static_cast<uint32_t>(key)) {
+                    // a null weight pointer means every row counts with weight 1
+                    const Type wt = d_weight ? d_weight[row] : Type(1);
+                    acc += d_A[lda * row + col] * wt;
+                }
+            }
+
+            d_sums[key * ncols + col] = acc;
+        }
+        // Host launcher for the naive reference reduction.
+        // Grid: x covers the columns in blocks of 32 threads, y has one block per key.
+        // Both the zero-fill and the kernel are enqueued on `stream`; the caller synchronizes.
+        template <typename Type>
+        void naiveReduceRowsByKey(const Type* d_A, int lda, uint32_t* d_keys,
+                                  const Type* d_weight, char* d_char_keys,
+                                  int nrows, int ncols, int nkeys,
+                                  Type* d_sums, cudaStream_t stream)
+        {
+            // Stream-ordered and error-checked, so the fill is sequenced before the kernel below.
+            RAFT_CUDA_TRY(cudaMemsetAsync(d_sums, 0, sizeof(Type) * nkeys * ncols, stream));
+            const dim3 grid((ncols + 31) / 32, nkeys), block(32, 1);
+            naiveReduceRowsByKeyKernel<<<grid, block, 0, stream>>>(
+                    d_A, lda, d_keys, d_weight, d_char_keys, nrows, ncols, nkeys, d_sums);
+            // A bad launch configuration would otherwise go unnoticed until the comparison.
+            RAFT_CUDA_TRY(cudaPeekAtLastError());
+        }
+
+        template <typename T>
+        struct ReduceRowsInputs {
+            T tolerance;  // handed to raft::CompareApprox when results are checked against the reference
+            int nobs;  // number of observations, i.e. rows of the input (one key per row)
+            uint32_t cols;  // number of columns of the input and of the output
+            uint32_t nkeys;  // number of output rows; also the upper bound given to uniformInt for keys
+            unsigned long long int seed;  // seed shared by all raft::random::Rng instances in the test
+            bool weighted;  // when true a per-row weight vector is generated and the weighted overload is used
+            T max_weight;  // upper bound of the uniformly drawn weights (lower bound is 1); unused if !weighted
+        };
+
+        template <typename T>
+        ::std::ostream& operator<<(::std::ostream& os, const ReduceRowsInputs<T>& dims)
+        {
+            return os;  // writes nothing; presumably only here so gtest has a printer for the parameter type
+        }
+
+        template <typename T>
+        class ReduceRowTest : public ::testing::TestWithParam<ReduceRowsInputs<T>> {
+        public:
+            // All device buffers are sized up front from the test parameters and are
+            // allocated on the handle's stream.
+            ReduceRowTest()
+                    : params(::testing::TestWithParam<ReduceRowsInputs<T>>::GetParam()),
+                      stream(handle.get_stream()),
+                      in(params.nobs * params.cols, stream),
+                      out(params.nkeys * params.cols, stream),
+                      out_ref(params.nkeys * params.cols, stream),
+                      keys(params.nobs, stream),
+                      scratch_buf(params.nobs, stream)
+            {
+            }
+
+        protected:
+            // Generates random data, keys and (optionally) weights, then fills out_ref
+            // with the naive reduction and out with the result of reduce_rows_by_key.
+            // Everything is enqueued on `stream`, which is synchronized at the end.
+            void SetUp() override
+            {
+                int nobs       = params.nobs;
+                uint32_t cols  = params.cols;
+                uint32_t nkeys = params.nkeys;
+
+                raft::random::Rng data_rng(params.seed);
+                raft::random::Rng key_rng(params.seed);
+                data_rng.uniform(in.data(), nobs * cols, T(0.0), T(2.0 / nobs), stream);
+                key_rng.uniformInt(keys.data(), nobs, (uint32_t)0, nkeys, stream);
+
+                // Weights stay empty (and the pointer null) for the unweighted cases.
+                rmm::device_uvector<T> weight(0, stream);
+                T* weight_ptr = nullptr;
+                if (params.weighted) {
+                    weight.resize(nobs, stream);
+                    raft::random::Rng weight_rng(params.seed, raft::random::GeneratorType::GenPhilox);
+                    weight_rng.uniform(weight.data(), nobs, T(1), params.max_weight, stream);
+                    weight_ptr = weight.data();
+                }
+
+                naiveReduceRowsByKey(in.data(), cols, keys.data(), weight_ptr, scratch_buf.data(),
+                                     nobs, cols, nkeys, out_ref.data(), stream);
+
+                // The unweighted case goes through the overload without a weight argument.
+                if (params.weighted) {
+                    reduce_rows_by_key(in.data(), cols, keys.data(), weight_ptr, scratch_buf.data(),
+                                       nobs, cols, nkeys, out.data(), stream);
+                } else {
+                    reduce_rows_by_key(in.data(), cols, keys.data(), scratch_buf.data(),
+                                       nobs, cols, nkeys, out.data(), stream);
+                }
+                // Both results must be complete before the test body compares them.
+                handle.sync_stream(stream);
+            }
+
+        protected:
+            // Declaration order matters: `stream` is read from `handle`, and the buffers
+            // are allocated on `stream`, so each must come after what it depends on.
+            ReduceRowsInputs<T> params;
+            raft::handle_t handle;
+            cudaStream_t stream = 0;
+
+            int device_count = 0;
+            // in: nobs x cols input; out / out_ref: nkeys x cols results (tested vs. reference)
+            rmm::device_uvector<T> in, out, out_ref;
+            // one key per observation
+            rmm::device_uvector<uint32_t> keys;
+            // char workspace with one entry per observation, handed to both reductions
+            rmm::device_uvector<char> scratch_buf;
+        };
+
+// ReduceRowTestF
+// 128 Obs, 32 cols, 6 clusters
+        const std::vector<ReduceRowsInputs<float>> inputsf2 = {{0.000001f, 128, 32, 6, 1234ULL, false},
+                                                               {0.000001f, 128, 32, 6, 1234ULL, true, 1.0},
+                                                               {0.000001f, 128, 32, 6, 1234ULL, true, 2.0}};
+        using ReduceRowTestF = ReduceRowTest<float>;
+        TEST_P(ReduceRowTestF, Result)
+        {
+            // out holds nkeys rows of cols entries; compare all of them with the reference
+            const auto n_out = params.cols * params.nkeys;
+            auto approx      = raft::CompareApprox<float>(params.tolerance);
+            ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), n_out, approx));
+        }
+        INSTANTIATE_TEST_CASE_P(ReduceRowTests, ReduceRowTestF, ::testing::ValuesIn(inputsf2));
+
+// ReduceRowTestD
+// 128 Obs, 32 cols, 6 clusters, double precision
+    const std::vector<ReduceRowsInputs<double>> inputsd2 = {
+            {0.00000001, 128, 32, 6, 1234ULL, false},
+            {0.00000001, 128, 32, 6, 1234ULL, true, 2.0},
+            {0.00000001, 128, 32, 6, 1234ULL, true, 8.0}};
+    using ReduceRowTestD = ReduceRowTest<double>;
+    TEST_P(ReduceRowTestD, Result)
+    {
+        // out holds nkeys rows of cols entries; compare all of them with the reference
+        const auto n_out = params.cols * params.nkeys;
+        auto approx      = raft::CompareApprox<double>(params.tolerance);
+        ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), n_out, approx));
+    }
+    INSTANTIATE_TEST_CASE_P(ReduceRowTests, ReduceRowTestD, ::testing::ValuesIn(inputsd2));
+
+// ReduceRowTestSmallnKey
+// 128 Obs, 32 cols, 3 clusters
+const std::vector<ReduceRowsInputs<float>> inputsf_small_nkey = {
+        {0.000001f, 128, 32, 3, 1234ULL, false},
+        {0.000001f, 128, 32, 3, 1234ULL, true, 5.0},
+        {0.000001f, 128, 32, 3, 1234ULL, true, 8.0}};
+using ReduceRowTestSmallnKey = ReduceRowTest<float>;
+TEST_P(ReduceRowTestSmallnKey, Result)
+{
+    // out holds nkeys rows of cols entries; compare all of them with the reference
+    const auto n_out = params.cols * params.nkeys;
+    auto approx      = raft::CompareApprox<float>(params.tolerance);
+    ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), n_out, approx));
+}
+INSTANTIATE_TEST_CASE_P(ReduceRowTests,
+        ReduceRowTestSmallnKey,
+        ::testing::ValuesIn(inputsf_small_nkey));
+
+// ReduceRowTestBigSpace
+// 512 Obs, 1024 cols, 40 clusters, double precision
+const std::vector<ReduceRowsInputs<double>> inputsd_big_space = {
+        {0.00000001, 512, 1024, 40, 1234ULL, false},
+        {0.00000001, 512, 1024, 40, 1234ULL, true, 4.0},
+        {0.00000001, 512, 1024, 40, 1234ULL, true, 16.0}};
+using ReduceRowTestBigSpace = ReduceRowTest<double>;
+TEST_P(ReduceRowTestBigSpace, Result)
+{
+    // out holds nkeys rows of cols entries; compare all of them with the reference
+    const auto n_out = params.cols * params.nkeys;
+    auto approx      = raft::CompareApprox<double>(params.tolerance);
+    ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), n_out, approx));
+}
+INSTANTIATE_TEST_CASE_P(ReduceRowTests,
+        ReduceRowTestBigSpace,
+        ::testing::ValuesIn(inputsd_big_space));
+
+// ReduceRowTestManyObs
+// 100000 Obs, 37 cols, 32 clusters
+const std::vector<ReduceRowsInputs<float>> inputsf_many_obs = {
+        {0.00001f, 100000, 37, 32, 1234ULL, false},
+        {0.00001f, 100000, 37, 32, 1234ULL, true, 4.0},
+        {0.00001f, 100000, 37, 32, 1234ULL, true, 16.0}};
+using ReduceRowTestManyObs = ReduceRowTest<float>;
+TEST_P(ReduceRowTestManyObs, Result)
+{
+    // out holds nkeys rows of cols entries; compare all of them with the reference
+    const auto n_out = params.cols * params.nkeys;
+    auto approx      = raft::CompareApprox<float>(params.tolerance);
+    ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), n_out, approx));
+}
+INSTANTIATE_TEST_CASE_P(ReduceRowTests,
+        ReduceRowTestManyObs,
+        ::testing::ValuesIn(inputsf_many_obs));
+
+// ReduceRowTestManyClusters
+// 100000 Obs, 37 cols, 2048 clusters
+const std::vector<ReduceRowsInputs<float>> inputsf_many_cluster = {
+        {0.00001f, 100000, 37, 2048, 1234ULL, false},
+        {0.00001f, 100000, 37, 2048, 1234ULL, true, 32.0},
+        {0.00001f, 100000, 37, 2048, 1234ULL, true, 16.0}};
+using ReduceRowTestManyClusters = ReduceRowTest<float>;
+TEST_P(ReduceRowTestManyClusters, Result)
+{
+    // out holds nkeys rows of cols entries; compare all of them with the reference
+    const auto n_out = params.cols * params.nkeys;
+    auto approx      = raft::CompareApprox<float>(params.tolerance);
+    ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), n_out, approx));
+}
+INSTANTIATE_TEST_CASE_P(ReduceRowTests,
+        ReduceRowTestManyClusters,
+        ::testing::ValuesIn(inputsf_many_cluster));
+
+}  // end namespace linalg
+}  // end namespace raft
diff --git a/cpp/test/linalg/rsvd.cu b/cpp/test/linalg/rsvd.cu
new file mode 100644
index 0000000000..260ea07268
--- /dev/null
+++ b/cpp/test/linalg/rsvd.cu
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/linalg/rsvd.cuh>
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/handle.hpp>
+#include <raft/random/rng.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft {
+    namespace linalg {
+
+        template <typename T>  // Parameters of one randomized-SVD test case; T is the scalar type.
+        struct RsvdInputs {
+            T tolerance;  // comparison tolerance; a value > 1 makes RsvdTest::SetUp() load its fixed 3x2 sanity data
+            int n_row;  // number of rows of the input matrix A
+            int n_col;  // number of columns of the input matrix A
+            T PC_perc;  // fraction of min(n_row, n_col) from which k is derived when k == 0
+            T UpS_perc;  // fraction of min(n_row, n_col) from which p is derived when k == 0
+            int k;  // rank handed to rsvdFixedRank; 0 asks the fixture to derive it from PC_perc
+            int p;  // passed to rsvdFixedRank right after k (presumably the oversampling amount -- TODO confirm)
+            bool use_bbt;  // forwarded unchanged to the rsvd call
+            unsigned long long int seed;  // seed of the raft::random::Rng that fills A
+        };
+
+        template <typename T>  // Stream inserter for RsvdInputs; presumably exists so gtest has a printer for the parameter type.
+        ::std::ostream& operator<<(::std::ostream& os, const RsvdInputs<T>& dims)
+        {
+            return os;  // writes nothing; `dims` is unused
+        }
+
+        template <typename T>  // Fixture: runs one randomized SVD in SetUp(); the TEST_P bodies then inspect U, S and V.
+        class RsvdTest : public ::testing::TestWithParam<RsvdInputs<T>> {
+        protected:
+            RsvdTest()  // all buffers start empty on the zero-initialized `stream` member and are sized in SetUp()
+                    : A(0, stream),
+                      U(0, stream),
+                      S(0, stream),
+                      V(0, stream),
+                      left_eig_vectors_ref(0, stream),
+                      right_eig_vectors_ref(0, stream),
+                      sing_vals_ref(0, stream)
+            {
+            }
+
+            void SetUp() override
+            {
+                raft::handle_t handle;
+                stream = handle.get_stream();
+
+                params = ::testing::TestWithParam<RsvdInputs<T>>::GetParam();
+                // rSVD seems to be very sensitive to the random number sequence as well!
+                raft::random::Rng r(params.seed, raft::random::GenTaps);
+                int m = params.n_row, n = params.n_col;
+                T eig_svd_tol  = 1.e-7;
+                int max_sweeps = 100;
+
+                T mu = 0.0, sigma = 1.0;
+                A.resize(m * n, stream);
+                if (params.tolerance > 1) {  // Sanity check
+                    ASSERT(m == 3, "This test only supports mxn=3x2!");
+                    ASSERT(m * n == 6, "This test only supports mxn=3x2!");
+                    T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0};
+                    raft::update_device(A.data(), data_h, m * n, stream);
+
+                    T left_eig_vectors_ref_h[]  = {-0.308219, -0.906133, -0.289695};
+                    T right_eig_vectors_ref_h[] = {-0.638636, -0.769509};
+                    T sing_vals_ref_h[]         = {7.065283};
+
+                    left_eig_vectors_ref.resize(m, stream);
+                    right_eig_vectors_ref.resize(n, stream);
+                    sing_vals_ref.resize(1, stream);
+
+                    raft::update_device(left_eig_vectors_ref.data(), left_eig_vectors_ref_h, m * 1, stream);
+                    raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, n * 1, stream);
+                    raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, 1, stream);
+
+                } else {  // Other normal tests
+                    r.normal(A.data(), m * n, mu, sigma, stream);
+                }
+                std::vector<T> A_backup_cpu(m *
+                                            n);  // Backup A matrix as svdJacobi will destroy the content of A
+                raft::update_host(A_backup_cpu.data(), A.data(), m * n, stream);
+                // k == 0 requests ratio mode. Record that now: k is overwritten just below, and re-testing it later made the rsvdPerc branch unreachable.
+                const bool ratio_mode = (params.k == 0);
+                if (ratio_mode) {  // derive the ranks the TEST_P bodies use to size their comparisons
+                    params.k = max((int)(min(m, n) * params.PC_perc), 1);
+                    params.p = max((int)(min(m, n) * params.UpS_perc), 1);
+                }
+                U.resize(m * params.k, stream);
+                S.resize(params.k, stream);
+                V.resize(n * params.k, stream);
+                RAFT_CUDA_TRY(cudaMemsetAsync(U.data(), 0, U.size() * sizeof(T), stream));
+                RAFT_CUDA_TRY(cudaMemsetAsync(S.data(), 0, S.size() * sizeof(T), stream));
+                RAFT_CUDA_TRY(cudaMemsetAsync(V.data(), 0, V.size() * sizeof(T), stream));
+
+                // RSVD tests
+                if (ratio_mode) {  // Test with PC and upsampling ratio
+                    rsvdPerc(handle,
+                             A.data(),
+                             m,
+                             n,
+                             S.data(),
+                             U.data(),
+                             V.data(),
+                             params.PC_perc,
+                             params.UpS_perc,
+                             params.use_bbt,
+                             true,
+                             true,
+                             false,
+                             eig_svd_tol,
+                             max_sweeps,
+                             stream);
+                } else {  // Test with directly given fixed rank
+                    rsvdFixedRank(handle,
+                                  A.data(),
+                                  m,
+                                  n,
+                                  S.data(),
+                                  U.data(),
+                                  V.data(),
+                                  params.k,
+                                  params.p,
+                                  params.use_bbt,
+                                  true,
+                                  true,
+                                  true,
+                                  eig_svd_tol,
+                                  max_sweeps,
+                                  stream);
+                }
+                raft::update_device(A.data(), A_backup_cpu.data(), m * n, stream);  // restore A from the host backup for the test bodies
+            }
+
+        protected:
+            cudaStream_t stream = 0;  // declared before the buffers so it is initialized when their constructors read it
+            RsvdInputs<T> params;  // current test parameters; k and p are rewritten in ratio mode
+            rmm::device_uvector<T> A, U, S, V, left_eig_vectors_ref, right_eig_vectors_ref, sing_vals_ref;
+        };
+
+        const std::vector<RsvdInputs<float>> inputs_fx = {  // fields: tolerance, n_row, n_col, PC_perc, UpS_perc, k, p, use_bbt, seed
+                // Test with ratios
+                {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL},     // Square + BBT
+                {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL},    // Tall + BBT
+                {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL},    // Square + non-BBT
+                {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL},   // Tall + non-BBT
+                {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL},   // Square + BBT
+                {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL},  // Tall + BBT
+                {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL},  // Square + non-BBT
+                {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL}  // Tall + non-BBT
+
+                ,                                                         // Test with fixed ranks
+                {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL},     // Square + BBT
+                {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL},    // Tall + BBT
+                {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL},    // Square + non-BBT
+                {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL},   // Tall + non-BBT
+                {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL},   // Square + BBT
+                {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL},  // Tall + BBT
+                {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL},  // Square + non-BBT
+                {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL}  // Tall + non-BBT
+        };
+
+        const std::vector<RsvdInputs<double>> inputs_dx = {  // fields: tolerance, n_row, n_col, PC_perc, UpS_perc, k, p, use_bbt, seed
+                // Test with ratios
+                {0.20, 256, 256, 0.2, 0.05, 0, 0, true, 4321ULL},     // Square + BBT
+                {0.20, 2048, 256, 0.2, 0.05, 0, 0, true, 4321ULL},    // Tall + BBT
+                {0.20, 256, 256, 0.2, 0.05, 0, 0, false, 4321ULL},    // Square + non-BBT
+                {0.20, 2048, 256, 0.2, 0.05, 0, 0, false, 4321ULL},   // Tall + non-BBT
+                {0.20, 2048, 2048, 0.2, 0.05, 0, 0, true, 4321ULL},   // Square + BBT
+                {0.60, 16384, 2048, 0.2, 0.05, 0, 0, true, 4321ULL},  // Tall + BBT
+                {0.20, 2048, 2048, 0.2, 0.05, 0, 0, false, 4321ULL},  // Square + non-BBT
+                {0.60, 16384, 2048, 0.2, 0.05, 0, 0, false, 4321ULL}  // Tall + non-BBT
+
+                ,                                                      // Test with fixed ranks
+                {0.10, 256, 256, 0.0, 0.0, 100, 5, true, 4321ULL},     // Square + BBT
+                {0.12, 2048, 256, 0.0, 0.0, 100, 5, true, 4321ULL},    // Tall + BBT
+                {0.10, 256, 256, 0.0, 0.0, 100, 5, false, 4321ULL},    // Square + non-BBT
+                {0.12, 2048, 256, 0.0, 0.0, 100, 5, false, 4321ULL},   // Tall + non-BBT
+                {0.60, 2048, 2048, 0.0, 0.0, 100, 5, true, 4321ULL},   // Square + BBT
+                {1.00, 16384, 2048, 0.0, 0.0, 100, 5, true, 4321ULL},  // Tall + BBT
+                {0.60, 2048, 2048, 0.0, 0.0, 100, 5, false, 4321ULL},  // Square + non-BBT
+                {1.00, 16384, 2048, 0.0, 0.0, 100, 5, false, 4321ULL}  // Tall + non-BBT
+        };
+
+        const std::vector<RsvdInputs<float>> sanity_inputs_fx = {  // tolerance > 1 makes RsvdTest::SetUp() load its fixed 3x2 matrix
+                {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, true, 4321ULL},  // NOTE(review): a 1e17 tolerance looks effectively unconstrained -- confirm intent
+                {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, true, 4321ULL},
+                {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, false, 4321ULL},
+                {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, false, 4321ULL}};
+
+        const std::vector<RsvdInputs<double>> sanity_inputs_dx = {  // tolerance > 1 makes RsvdTest::SetUp() load its fixed 3x2 matrix
+                {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, true, 4321ULL},  // NOTE(review): a 1e17 tolerance looks effectively unconstrained -- confirm intent
+                {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, true, 4321ULL},
+                {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, false, 4321ULL},
+                {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, false, 4321ULL}};
+
+        typedef RsvdTest<float> RsvdSanityCheckValF;  // float: computed singular values S vs. sing_vals_ref
+        TEST_P(RsvdSanityCheckValF, Result)
+    {
+        ASSERT_TRUE(devArrMatch(
+                sing_vals_ref.data(), S.data(), params.k, raft::CompareApproxAbs<float>(params.tolerance)));
+    }
+
+    typedef RsvdTest<double> RsvdSanityCheckValD;  // double counterpart of RsvdSanityCheckValF
+    TEST_P(RsvdSanityCheckValD, Result)
+{
+    ASSERT_TRUE(devArrMatch(
+            sing_vals_ref.data(), S.data(), params.k, raft::CompareApproxAbs<double>(params.tolerance)));
+}
+
+typedef RsvdTest<float> RsvdSanityCheckLeftVecF;  // float: U vs. left_eig_vectors_ref over n_row * k elements
+TEST_P(RsvdSanityCheckLeftVecF, Result)
+{
+ASSERT_TRUE(devArrMatch(left_eig_vectors_ref.data(),
+                        U.data(),
+                        params.n_row * params.k,
+                        raft::CompareApproxAbs<float>(params.tolerance)));
+}
+
+typedef RsvdTest<double> RsvdSanityCheckLeftVecD;  // double counterpart of RsvdSanityCheckLeftVecF
+TEST_P(RsvdSanityCheckLeftVecD, Result)
+{
+ASSERT_TRUE(devArrMatch(left_eig_vectors_ref.data(),
+                        U.data(),
+                        params.n_row * params.k,
+                        raft::CompareApproxAbs<double>(params.tolerance)));
+}
+
+typedef RsvdTest<float> RsvdSanityCheckRightVecF;  // float: V vs. right_eig_vectors_ref over n_col * k elements
+TEST_P(RsvdSanityCheckRightVecF, Result)
+{
+ASSERT_TRUE(devArrMatch(right_eig_vectors_ref.data(),
+                        V.data(),
+                        params.n_col * params.k,
+                        raft::CompareApproxAbs<float>(params.tolerance)));
+}
+
+typedef RsvdTest<double> RsvdSanityCheckRightVecD;  // double counterpart of RsvdSanityCheckRightVecF
+TEST_P(RsvdSanityCheckRightVecD, Result)
+{
+ASSERT_TRUE(devArrMatch(right_eig_vectors_ref.data(),
+                        V.data(),
+                        params.n_col * params.k,
+                        raft::CompareApproxAbs<double>(params.tolerance)));
+}
+
+typedef RsvdTest<float> RsvdTestSquareMatrixNormF;  // float: judges the decomposition as a whole via evaluateSVDByL2Norm
+TEST_P(RsvdTestSquareMatrixNormF, Result)
+{
+raft::handle_t handle;  // a fresh handle, separate from the one SetUp() used
+
+ASSERT_TRUE(raft::linalg::evaluateSVDByL2Norm(handle,
+                                              A.data(),
+                                              U.data(),
+                                              S.data(),
+                                              V.data(),
+                                              params.n_row,
+                                              params.n_col,
+                                              params.k,
+                                              4 * params.tolerance,
+                                              handle.get_stream()));
+}
+
+typedef RsvdTest<double> RsvdTestSquareMatrixNormD;  // double counterpart; NOTE(review): the input tables also hold tall matrices despite "Square" in the name
+TEST_P(RsvdTestSquareMatrixNormD, Result)
+{
+raft::handle_t handle;  // a fresh handle, separate from the one SetUp() used
+
+ASSERT_TRUE(raft::linalg::evaluateSVDByL2Norm(handle,
+                                              A.data(),
+                                              U.data(),
+                                              S.data(),
+                                              V.data(),
+                                              params.n_row,
+                                              params.n_col,
+                                              params.k,
+                                              4 * params.tolerance,
+                                              handle.get_stream()));
+}
+
+INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdSanityCheckValF, ::testing::ValuesIn(sanity_inputs_fx));  // sanity fixtures run on the 3x2 tables
+
+INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdSanityCheckValD, ::testing::ValuesIn(sanity_inputs_dx));
+
+INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdSanityCheckLeftVecF, ::testing::ValuesIn(sanity_inputs_fx));
+
+INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdSanityCheckLeftVecD, ::testing::ValuesIn(sanity_inputs_dx));
+
+INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdSanityCheckRightVecF, ::testing::ValuesIn(sanity_inputs_fx));
+
+INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdSanityCheckRightVecD, ::testing::ValuesIn(sanity_inputs_dx));
+
+INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdTestSquareMatrixNormF, ::testing::ValuesIn(inputs_fx));  // norm fixtures run on the random-matrix tables
+
+INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdTestSquareMatrixNormD, ::testing::ValuesIn(inputs_dx));
+
+}  // end namespace linalg
+}  // end namespace raft
diff --git a/cpp/test/linalg/sqrt.cu b/cpp/test/linalg/sqrt.cu
new file mode 100644
index 0000000000..bf64d264ad
--- /dev/null
+++ b/cpp/test/linalg/sqrt.cu
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/linalg/sqrt.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.hpp>
+
+namespace raft {
+    namespace linalg {
+
+        template <typename Type>  // Reference kernel: one thread per element, out[idx] = raft::mySqrt(in1[idx]).
+        __global__ void naiveSqrtElemKernel(Type* out, const Type* in1, int len)
+        {
+            int idx = threadIdx.x + blockIdx.x * blockDim.x;  // flat 1-D global thread index
+            if (idx < len) { out[idx] = raft::mySqrt(in1[idx]); }  // guard: the last block may extend past len
+        }
+
+        template <typename Type>  // Host launcher: runs naiveSqrtElemKernel over `len` elements of in1, writing to out.
+        void naiveSqrtElem(Type* out, const Type* in1, int len)
+        {
+            static const int TPB = 64;  // threads per block
+            int nblks            = raft::ceildiv(len, TPB);  // enough blocks to cover len
+            naiveSqrtElemKernel<Type><<<nblks, TPB>>>(out, in1, len);  // no stream argument, so this launches on the default stream
+            RAFT_CUDA_TRY(cudaPeekAtLastError());  // reports a launch error; does not wait for the kernel to finish
+        }
+
+        template <typename T>  // Parameters of one sqrt test case.
+        struct SqrtInputs {
+            T tolerance;  // passed to raft::CompareApprox in the TEST_P bodies
+            int len;  // number of elements in every buffer
+            unsigned long long int seed;  // seed of the raft::random::Rng that fills the input
+        };
+
+        template <typename T>  // Stream inserter for SqrtInputs; presumably exists so gtest has a printer for the parameter type.
+        ::std::ostream& operator<<(::std::ostream& os, const SqrtInputs<T>& dims)
+        {
+            return os;  // writes nothing; `dims` is unused
+        }
+
+        template <typename T>  // Fixture: SetUp() computes sqrt out-of-place and in-place next to a naive reference result.
+        class SqrtTest : public ::testing::TestWithParam<SqrtInputs<T>> {
+        protected:
+            SqrtTest() : in1(0, stream), out_ref(0, stream), out(0, stream) {}
+
+            void SetUp() override
+            {
+                params = ::testing::TestWithParam<SqrtInputs<T>>::GetParam();
+                raft::random::Rng r(params.seed);
+                stream = handle.get_stream();  // handle-provided stream: stays valid while the buffers and test bodies still use it
+                int len = params.len;
+                in1.resize(len, stream);
+                out_ref.resize(len, stream);
+                out.resize(len, stream);
+                r.uniform(in1.data(), len, T(1.0), T(2.0), stream);  // inputs drawn from [1, 2), so every sqrt is well defined
+
+                naiveSqrtElem(out_ref.data(), in1.data(), len);  // reference must be computed before in1 is overwritten below
+
+                sqrt(out.data(), in1.data(), len, stream);  // out-of-place
+                sqrt(in1.data(), in1.data(), len, stream);  // in-place: in1 now holds the result as well
+                RAFT_CUDA_TRY(cudaStreamSynchronize(stream));  // results are complete before the TEST_P bodies read them; the stream is not destroyed here
+            }
+        protected:
+            raft::handle_t handle;  // source of `stream`; same pattern as the ternaryOpTest fixture
+            cudaStream_t stream = 0;  // declared before the buffers so it is initialized when their constructors read it
+            SqrtInputs<T> params;
+            rmm::device_uvector<T> in1, out_ref, out;
+            int device_count = 0;
+        };
+
+        const std::vector<SqrtInputs<float>> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}};  // {tolerance, len, seed}
+
+        const std::vector<SqrtInputs<double>> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}};  // {tolerance, len, seed}
+
+        typedef SqrtTest<float> SqrtTestF;  // float: checks both the out-of-place and the in-place result
+        TEST_P(SqrtTestF, Result)
+    {
+        ASSERT_TRUE(raft::devArrMatch(
+                out_ref.data(), out.data(), params.len, raft::CompareApprox<float>(params.tolerance)));  // out-of-place result
+
+        ASSERT_TRUE(raft::devArrMatch(
+                out_ref.data(), in1.data(), params.len, raft::CompareApprox<float>(params.tolerance)));  // in-place result
+    }
+
+    typedef SqrtTest<double> SqrtTestD;  // double counterpart of SqrtTestF
+    TEST_P(SqrtTestD, Result)
+{
+    ASSERT_TRUE(raft::devArrMatch(
+            out_ref.data(), out.data(), params.len, raft::CompareApprox<double>(params.tolerance)));  // out-of-place result
+
+    ASSERT_TRUE(raft::devArrMatch(
+            out_ref.data(), in1.data(), params.len, raft::CompareApprox<double>(params.tolerance)));  // in-place result
+}
+
+INSTANTIATE_TEST_CASE_P(SqrtTests, SqrtTestF, ::testing::ValuesIn(inputsf2));  // float cases
+
+INSTANTIATE_TEST_CASE_P(SqrtTests, SqrtTestD, ::testing::ValuesIn(inputsd2));  // double cases
+
+}  // end namespace linalg
+}  // end namespace raft
diff --git a/cpp/test/linalg/ternary_op.cu b/cpp/test/linalg/ternary_op.cu
new file mode 100644
index 0000000000..83ec3e6029
--- /dev/null
+++ b/cpp/test/linalg/ternary_op.cu
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/linalg/ternary_op.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.hpp>
+
+namespace raft {
+    namespace linalg {
+
+        template <typename InType, typename IdxType = int, typename OutType = InType>  // Parameters of one ternaryOp test case; OutType is not used by any member.
+        struct BinaryOpInputs {
+            InType tolerance;  // passed to raft::CompareApprox in the TEST_P bodies
+            IdxType len;  // number of elements in every buffer
+            unsigned long long int seed;  // seed of the raft::random::Rng used for the fills
+        };
+
+        template <typename InType, typename IdxType = int, typename OutType = InType>  // Stream inserter for BinaryOpInputs; presumably exists so gtest has a printer.
+        ::std::ostream& operator<<(::std::ostream& os, const BinaryOpInputs<InType, IdxType, OutType>& d)
+        {
+            return os;  // writes nothing; `d` is unused
+        }
+
+        template <typename T>  // Fixture: applies ternaryOp with an add and a multiply lambda to constant-filled inputs.
+        class ternaryOpTest : public ::testing::TestWithParam<BinaryOpInputs<T>> {
+        public:
+            ternaryOpTest()
+                    : params(::testing::TestWithParam<BinaryOpInputs<T>>::GetParam()),
+                      stream(handle.get_stream()),  // `handle` is declared before `stream`, so it is already constructed here
+                      out_add_ref(params.len, stream),
+                      out_add(params.len, stream),
+                      out_mul_ref(params.len, stream),
+                      out_mul(params.len, stream)
+            {
+            }
+
+            void SetUp() override
+            {
+                raft::random::Rng rng(params.seed);
+                int len = params.len;
+                rmm::device_uvector<T> in1(len, stream);  // inputs are local to SetUp(); only the outputs are kept for the tests
+                rmm::device_uvector<T> in2(len, stream);
+                rmm::device_uvector<T> in3(len, stream);
+
+                rng.fill(out_add_ref.data(), len, T(6.0), stream);  // expected sum: 1 + 2 + 3
+                rng.fill(out_mul_ref.data(), len, T(6.0), stream);  // expected product: 1 * 2 * 3
+                rng.fill(in1.data(), len, T(1.0), stream);
+                rng.fill(in2.data(), len, T(2.0), stream);
+                rng.fill(in3.data(), len, T(3.0), stream);
+
+                auto add = [] __device__(T a, T b, T c) { return a + b + c; };
+                auto mul = [] __device__(T a, T b, T c) { return a * b * c; };
+                ternaryOp(out_add.data(), in1.data(), in2.data(), in3.data(), len, add, stream);
+                ternaryOp(out_mul.data(), in1.data(), in2.data(), in3.data(), len, mul, stream);
+            }
+
+        protected:
+            BinaryOpInputs<T> params;
+            raft::handle_t handle;  // source of `stream`
+            cudaStream_t stream = 0;  // overwritten by the constructor's initializer list
+
+            rmm::device_uvector<T> out_add_ref, out_add, out_mul_ref, out_mul;
+        };
+
+        const std::vector<BinaryOpInputs<float>> inputsf = {{0.000001f, 1024 * 1024, 1234ULL},  // {tolerance, len, seed}
+                                                            {0.000001f, 1024 * 1024 + 2, 1234ULL},  // lengths just past 1024 * 1024
+                                                            {0.000001f, 1024 * 1024 + 1, 1234ULL}};
+        typedef ternaryOpTest<float> ternaryOpTestF;  // float: both ternaryOp outputs must equal the constant-6 references
+        TEST_P(ternaryOpTestF, Result)
+    {
+        ASSERT_TRUE(devArrMatch(
+                out_add_ref.data(), out_add.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
+        ASSERT_TRUE(devArrMatch(
+                out_mul_ref.data(), out_mul.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
+    }
+    INSTANTIATE_TEST_CASE_P(ternaryOpTests, ternaryOpTestF, ::testing::ValuesIn(inputsf));
+
+    const std::vector<BinaryOpInputs<double>> inputsd = {{0.00000001, 1024 * 1024, 1234ULL},  // {tolerance, len, seed}
+                                                         {0.00000001, 1024 * 1024 + 2, 1234ULL},  // lengths just past 1024 * 1024
+                                                         {0.00000001, 1024 * 1024 + 1, 1234ULL}};
+    typedef ternaryOpTest<double> ternaryOpTestD;  // double counterpart of ternaryOpTestF
+    TEST_P(ternaryOpTestD, Result)
+{
+    ASSERT_TRUE(devArrMatch(
+            out_add_ref.data(), out_add.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
+    ASSERT_TRUE(devArrMatch(
+            out_mul_ref.data(), out_mul.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(ternaryOpTests, ternaryOpTestD, ::testing::ValuesIn(inputsd));
+
+}  // end namespace linalg
+}  // end namespace raft

From 6a0d70c48aba6ae36234b84c92dca3fc789dc47d Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 9 Feb 2022 18:53:04 -0500
Subject: [PATCH 02/24] Fixing style

---
 cpp/include/raft/linalg/detail/lstsq.hpp      | 740 +++++++++---------
 .../raft/linalg/detail/reduce_cols_by_key.cuh |  64 +-
 .../raft/linalg/detail/reduce_rows_by_key.cuh | 647 +++++++--------
 cpp/include/raft/linalg/detail/rsvd.cuh       | 634 +++++++--------
 cpp/include/raft/linalg/detail/ternary_op.cuh | 121 +--
 cpp/include/raft/linalg/lstsq.hpp             |   8 +-
 cpp/include/raft/linalg/power.cuh             |  23 +-
 .../raft/linalg/reduce_cols_by_key.cuh        |  12 +-
 .../raft/linalg/reduce_rows_by_key.cuh        |  44 +-
 cpp/include/raft/linalg/rsvd.cuh              |  66 +-
 cpp/include/raft/linalg/sqrt.cuh              |   4 +-
 cpp/include/raft/linalg/ternary_op.cuh        |  15 +-
 cpp/test/linalg/power.cu                      | 204 ++---
 cpp/test/linalg/reduce_cols_by_key.cu         | 172 ++--
 cpp/test/linalg/reduce_rows_by_key.cu         | 358 ++++-----
 cpp/test/linalg/rsvd.cu                       | 474 +++++------
 cpp/test/linalg/sqrt.cu                       | 166 ++--
 cpp/test/linalg/ternary_op.cu                 | 138 ++--
 18 files changed, 1975 insertions(+), 1915 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/lstsq.hpp b/cpp/include/raft/linalg/detail/lstsq.hpp
index e8aeccc9b0..c91d6e41c1 100644
--- a/cpp/include/raft/linalg/detail/lstsq.hpp
+++ b/cpp/include/raft/linalg/detail/lstsq.hpp
@@ -36,44 +36,49 @@
 #include <rmm/device_uvector.hpp>
 
 namespace raft {
-    namespace linalg {
-        namespace detail {
+namespace linalg {
+namespace detail {
 
-            namespace {
+namespace {
 
 /** Operate a CUDA event if we're in the concurrent mode; no-op otherwise. */
-                struct DeviceEvent {
-                private:
-                    cudaEvent_t e;
-
-                public:
-                    DeviceEvent(bool concurrent) {
-                        if (concurrent)
-                            RAFT_CUDA_TRY(cudaEventCreate(&e));
-                        else
-                            e = nullptr;
-                    }
-
-                    ~DeviceEvent() {
-                        if (e != nullptr) RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(e));
-                    }
-
-                    operator cudaEvent_t() const { return e; }
-
-                    void record(cudaStream_t stream) {
-                        if (e != nullptr) RAFT_CUDA_TRY(cudaEventRecord(e, stream));
-                    }
-
-                    void wait(cudaStream_t stream) {
-                        if (e != nullptr) RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, e, 0u));
-                    }
-
-                    void wait() {
-                        if (e != nullptr) raft::interruptible::synchronize(e);
-                    }
-
-                    DeviceEvent &operator=(const DeviceEvent &other) = delete;
-                };
+struct DeviceEvent {
+ private:
+  cudaEvent_t e;  // nullptr when constructed with concurrent == false
+
+ public:
+  DeviceEvent(bool concurrent)  // creates a CUDA event only in concurrent mode
+  {
+    if (concurrent)
+      RAFT_CUDA_TRY(cudaEventCreate(&e));
+    else
+      e = nullptr;
+  }
+
+  ~DeviceEvent()  // destroys the event if one was created
+  {
+    if (e != nullptr) RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(e));
+  }
+
+  operator cudaEvent_t() const { return e; }  // lets the wrapper be passed where a raw cudaEvent_t is expected
+
+  void record(cudaStream_t stream)  // records the event on `stream`; no-op without an event
+  {
+    if (e != nullptr) RAFT_CUDA_TRY(cudaEventRecord(e, stream));
+  }
+
+  void wait(cudaStream_t stream)  // makes `stream` wait for the event; no-op without an event
+  {
+    if (e != nullptr) RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, e, 0u));
+  }
+
+  void wait()  // host-side wait through raft::interruptible::synchronize; no-op without an event
+  {
+    if (e != nullptr) raft::interruptible::synchronize(e);
+  }
+
+  DeviceEvent& operator=(const DeviceEvent& other) = delete;  // NOTE(review): copy construction is not deleted here; a copy would destroy the same event twice
+};
 
 /**
  *  @brief Tells if the viewed CUDA stream is implicitly synchronized with the given stream.
@@ -82,35 +87,36 @@ namespace raft {
  *   if the two views point to the same stream
  *   or sometimes when one of them is the legacy default stream.
  */
-                bool are_implicitly_synchronized(rmm::cuda_stream_view a, rmm::cuda_stream_view b) {
-                    // any stream is "synchronized" with itself
-                    if (a.value() == b.value()) return true;
-                    // legacy + blocking streams
-                    unsigned int flags = 0;
-                    if (a.is_default()) {
-                        RAFT_CUDA_TRY(cudaStreamGetFlags(b.value(), &flags));
-                        if ((flags & cudaStreamNonBlocking) == 0) return true;
-                    }
-                    if (b.is_default()) {
-                        RAFT_CUDA_TRY(cudaStreamGetFlags(a.value(), &flags));
-                        if ((flags & cudaStreamNonBlocking) == 0) return true;
-                    }
-                    return false;
-                }
-
-                template<typename math_t>
-                struct DivideByNonZero {
-                    constexpr static const math_t
-                    eps = math_t(1e-10);
-
-                    __device__ math_t
-
-                    operator()(const math_t a, const math_t b) const {
-                        return raft::myAbs<math_t>(b) >= eps ? a / b : a;
-                    }
-                };
-
-            }  // namespace
+bool are_implicitly_synchronized(rmm::cuda_stream_view a, rmm::cuda_stream_view b)
+{
+  // any stream is "synchronized" with itself
+  if (a.value() == b.value()) return true;
+  // legacy + blocking streams
+  unsigned int flags = 0;
+  if (a.is_default()) {  // a is the default stream: the answer depends on whether b was created non-blocking
+    RAFT_CUDA_TRY(cudaStreamGetFlags(b.value(), &flags));
+    if ((flags & cudaStreamNonBlocking) == 0) return true;
+  }
+  if (b.is_default()) {  // symmetric case with the roles of a and b swapped
+    RAFT_CUDA_TRY(cudaStreamGetFlags(a.value(), &flags));
+    if ((flags & cudaStreamNonBlocking) == 0) return true;
+  }
+  return false;  // no rule above applied
+}
+
+template <typename math_t>  // Device functor: a / b, except that a is returned unchanged when |b| is below eps.
+struct DivideByNonZero {
+  constexpr static const math_t eps = math_t(1e-10);  // magnitude threshold below which b is treated as zero
+
+  __device__ math_t
+
+  operator()(const math_t a, const math_t b) const
+  {
+    return raft::myAbs<math_t>(b) >= eps ? a / b : a;  // skip the division for near-zero divisors
+  }
+};
+
+}  // namespace
 
 /** Solves the linear ordinary least squares problem `Aw = b`
  *  Via SVD decomposition of `A = U S Vt` using default cuSOLVER routine.
@@ -118,58 +124,59 @@ namespace raft {
  *  @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines,
  *             so it's not guaranteed to stay unmodified.
  */
-            template<typename math_t>
-            void lstsqSvdQR(const raft::handle_t &handle,
-                            math_t *A,
-                            const int n_rows,
-                            const int n_cols,
-                            const math_t *b,
-                            math_t *w,
-                            cudaStream_t stream) {
-                const int minmn = min(n_rows, n_cols);
-                cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-                int cusolverWorkSetSize = 0;
-                // #TODO: Call from public API when ready
-                RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvd_bufferSize<math_t>(
-                        cusolverH, n_rows, n_cols, &cusolverWorkSetSize));
-
-                rmm::device_uvector <math_t> workset(cusolverWorkSetSize  // cuSolver
-                                                     + n_rows * minmn   // U
-                                                     + n_cols * n_cols  // V
-                                                     + minmn            // S
-                                                     + minmn            // U^T * b
-                                                     + 1                // devInfo
-                        ,
-                                                     stream);
-                math_t *cusolverWorkSet = workset.data();
-                math_t *U = cusolverWorkSet + cusolverWorkSetSize;
-                math_t *Vt = U + n_rows * minmn;
-                math_t *S = Vt + n_cols * n_cols;
-                math_t *Ub = S + minmn;
-                int *devInfo = reinterpret_cast<int *>(Ub + minmn);
-
-                // #TODO: Call from public API when ready
-                RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvd<math_t>(cusolverH,
-                                                                                'S',
-                                                                                'S',
-                                                                                n_rows,
-                                                                                n_cols,
-                                                                                A,
-                                                                                n_rows,
-                                                                                S,
-                                                                                U,
-                                                                                n_rows,
-                                                                                Vt,
-                                                                                n_cols,
-                                                                                cusolverWorkSet,
-                                                                                cusolverWorkSetSize,
-                                                                                nullptr,
-                                                                                devInfo,
-                                                                                stream));
-                raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream);
-                raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero<math_t>(), stream);
-                raft::linalg::gemv(handle, Vt, minmn, n_cols, n_cols, Ub, w, true, stream);
-            }
+template <typename math_t>
+void lstsqSvdQR(const raft::handle_t& handle,
+                math_t* A,
+                const int n_rows,
+                const int n_cols,
+                const math_t* b,
+                math_t* w,
+                cudaStream_t stream)
+{
+  const int minmn              = min(n_rows, n_cols);
+  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+  int cusolverWorkSetSize      = 0;
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvd_bufferSize<math_t>(
+    cusolverH, n_rows, n_cols, &cusolverWorkSetSize));
+
+  rmm::device_uvector<math_t> workset(cusolverWorkSetSize  // cuSolver
+                                        + n_rows * minmn   // U
+                                        + n_cols * n_cols  // V
+                                        + minmn            // S
+                                        + minmn            // U^T * b
+                                        + 1                // devInfo
+                                      ,
+                                      stream);
+  math_t* cusolverWorkSet = workset.data();
+  math_t* U               = cusolverWorkSet + cusolverWorkSetSize;
+  math_t* Vt              = U + n_rows * minmn;
+  math_t* S               = Vt + n_cols * n_cols;
+  math_t* Ub              = S + minmn;
+  int* devInfo            = reinterpret_cast<int*>(Ub + minmn);
+
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvd<math_t>(cusolverH,
+                                                                  'S',
+                                                                  'S',
+                                                                  n_rows,
+                                                                  n_cols,
+                                                                  A,
+                                                                  n_rows,
+                                                                  S,
+                                                                  U,
+                                                                  n_rows,
+                                                                  Vt,
+                                                                  n_cols,
+                                                                  cusolverWorkSet,
+                                                                  cusolverWorkSetSize,
+                                                                  nullptr,
+                                                                  devInfo,
+                                                                  stream));
+  raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream);
+  raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero<math_t>(), stream);
+  raft::linalg::gemv(handle, Vt, minmn, n_cols, n_cols, Ub, w, true, stream);
+}
 
 /** Solves the linear ordinary least squares problem `Aw = b`
  *  Via SVD decomposition of `A = U S V^T` using Jacobi iterations (cuSOLVER).
@@ -177,159 +184,161 @@ namespace raft {
  *  @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines,
  *             so it's not guaranteed to stay unmodified.
  */
-            template<typename math_t>
-            void lstsqSvdJacobi(const raft::handle_t &handle,
-                                math_t *A,
-                                const int n_rows,
-                                const int n_cols,
-                                const math_t *b,
-                                math_t *w,
-                                cudaStream_t stream) {
-                const int minmn = min(n_rows, n_cols);
-                gesvdjInfo_t gesvdj_params;
-                RAFT_CUSOLVER_TRY(cusolverDnCreateGesvdjInfo(&gesvdj_params));
-                int cusolverWorkSetSize = 0;
-                cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-                // #TODO: Call from public API when ready
-                RAFT_CUSOLVER_TRY(
-                        raft::linalg::detail::cusolverDngesvdj_bufferSize<math_t>(cusolverH,
-                                                                                  CUSOLVER_EIG_MODE_VECTOR,
-                                                                                  1,
-                                                                                  n_rows,
-                                                                                  n_cols,
-                                                                                  A,
-                                                                                  n_rows,
-                                                                                  nullptr,
-                                                                                  nullptr,
-                                                                                  n_rows,
-                                                                                  nullptr,
-                                                                                  n_cols,
-                                                                                  &cusolverWorkSetSize,
-                                                                                  gesvdj_params));
-                rmm::device_uvector <math_t> workset(cusolverWorkSetSize  // cuSolver
-                                                     + n_rows * minmn   // U
-                                                     + n_cols * minmn   // V
-                                                     + minmn            // S
-                                                     + minmn            // U^T * b
-                                                     + 1                // devInfo
-                        ,
-                                                     stream);
-                math_t *cusolverWorkSet = workset.data();
-                math_t *U = cusolverWorkSet + cusolverWorkSetSize;
-                math_t *V = U + n_rows * minmn;
-                math_t *S = V + n_cols * minmn;
-                math_t *Ub = S + minmn;
-                int *devInfo = reinterpret_cast<int *>(Ub + minmn);
-                // #TODO: Call from public API when ready
-                RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvdj<math_t>(cusolverH,
-                                                                                 CUSOLVER_EIG_MODE_VECTOR,
-                                                                                 1,
-                                                                                 n_rows,
-                                                                                 n_cols,
-                                                                                 A,
-                                                                                 n_rows,
-                                                                                 S,
-                                                                                 U,
-                                                                                 n_rows,
-                                                                                 V,
-                                                                                 n_cols,
-                                                                                 cusolverWorkSet,
-                                                                                 cusolverWorkSetSize,
-                                                                                 devInfo,
-                                                                                 gesvdj_params,
-                                                                                 stream));
-                raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream);
-                raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero<math_t>(), stream);
-                raft::linalg::gemv(handle, V, n_cols, minmn, Ub, w, false, stream);
-            }
+template <typename math_t>
+void lstsqSvdJacobi(const raft::handle_t& handle,
+                    math_t* A,
+                    const int n_rows,
+                    const int n_cols,
+                    const math_t* b,
+                    math_t* w,
+                    cudaStream_t stream)
+{
+  const int minmn = min(n_rows, n_cols);
+  gesvdjInfo_t gesvdj_params;
+  RAFT_CUSOLVER_TRY(cusolverDnCreateGesvdjInfo(&gesvdj_params));
+  int cusolverWorkSetSize      = 0;
+  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(
+    raft::linalg::detail::cusolverDngesvdj_bufferSize<math_t>(cusolverH,
+                                                              CUSOLVER_EIG_MODE_VECTOR,
+                                                              1,
+                                                              n_rows,
+                                                              n_cols,
+                                                              A,
+                                                              n_rows,
+                                                              nullptr,
+                                                              nullptr,
+                                                              n_rows,
+                                                              nullptr,
+                                                              n_cols,
+                                                              &cusolverWorkSetSize,
+                                                              gesvdj_params));
+  rmm::device_uvector<math_t> workset(cusolverWorkSetSize  // cuSolver
+                                        + n_rows * minmn   // U
+                                        + n_cols * minmn   // V
+                                        + minmn            // S
+                                        + minmn            // U^T * b
+                                        + 1                // devInfo
+                                      ,
+                                      stream);
+  math_t* cusolverWorkSet = workset.data();
+  math_t* U               = cusolverWorkSet + cusolverWorkSetSize;
+  math_t* V               = U + n_rows * minmn;
+  math_t* S               = V + n_cols * minmn;
+  math_t* Ub              = S + minmn;
+  int* devInfo            = reinterpret_cast<int*>(Ub + minmn);
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvdj<math_t>(cusolverH,
+                                                                   CUSOLVER_EIG_MODE_VECTOR,
+                                                                   1,
+                                                                   n_rows,
+                                                                   n_cols,
+                                                                   A,
+                                                                   n_rows,
+                                                                   S,
+                                                                   U,
+                                                                   n_rows,
+                                                                   V,
+                                                                   n_cols,
+                                                                   cusolverWorkSet,
+                                                                   cusolverWorkSetSize,
+                                                                   devInfo,
+                                                                   gesvdj_params,
+                                                                   stream));
+  raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream);
+  raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero<math_t>(), stream);
+  raft::linalg::gemv(handle, V, n_cols, minmn, Ub, w, false, stream);
+}
 
 /** Solves the linear ordinary least squares problem `Aw = b`
  *  via eigenvalue decomposition of `A^T * A` (covariance matrix for dataset A).
  *  (`w = (A^T A)^-1  A^T b`)
  */
-            template<typename math_t>
-            void lstsqEig(const raft::handle_t &handle,
-                          const math_t *A,
-                          const int n_rows,
-                          const int n_cols,
-                          const math_t *b,
-                          math_t *w,
-                          cudaStream_t stream) {
-                rmm::cuda_stream_view mainStream = rmm::cuda_stream_view(stream);
-                rmm::cuda_stream_view multAbStream = mainStream;
-                bool concurrent = false;
-                {
-                    int sp_size = handle.get_stream_pool_size();
-                    if (sp_size > 0) {
-                        multAbStream = handle.get_stream_from_stream_pool(0);
-                        // check if the two streams can run concurrently
-                        if (!are_implicitly_synchronized(mainStream, multAbStream)) {
-                            concurrent = true;
-                        } else if (sp_size > 1) {
-                            mainStream = multAbStream;
-                            multAbStream = handle.get_stream_from_stream_pool(1);
-                            concurrent = true;
-                        }
-                    }
-                }
-                // the event is created only if the given raft handle is capable of running
-                // at least two CUDA streams without implicit synchronization.
-                DeviceEvent multAbDone(concurrent);
-
-                rmm::device_uvector <math_t> workset(n_cols * n_cols * 3 + n_cols * 2, mainStream);
-                math_t *Q = workset.data();
-                math_t *QS = Q + n_cols * n_cols;
-                math_t *covA = QS + n_cols * n_cols;
-                math_t *S = covA + n_cols * n_cols;
-                math_t *Ab = S + n_cols;
-
-                // covA <- A* A
-                math_t alpha = math_t(1);
-                math_t beta = math_t(0);
-                raft::linalg::gemm(handle,
-                                   A,
-                                   n_rows,
-                                   n_cols,
-                                   A,
-                                   covA,
-                                   n_cols,
-                                   n_cols,
-                                   CUBLAS_OP_T,
-                                   CUBLAS_OP_N,
-                                   alpha,
-                                   beta,
-                                   mainStream);
-
-                // Ab <- A* b
-                raft::linalg::gemv(handle, A, n_rows, n_cols, b, Ab, true, multAbStream);
-                multAbDone.record(multAbStream);
-
-                // Q S Q* <- covA
-                raft::common::nvtx::push_range("raft::linalg::eigDC");
-                raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream);
-                raft::common::nvtx::pop_range();
-
-                // QS  <- Q invS
-                raft::linalg::matrixVectorOp(
-                        QS, Q, S, n_cols, n_cols, false, true, DivideByNonZero<math_t>(), mainStream);
-                // covA <- QS Q* == Q invS Q* == inv(A* A)
-                raft::linalg::gemm(handle,
-                                   QS,
-                                   n_cols,
-                                   n_cols,
-                                   Q,
-                                   covA,
-                                   n_cols,
-                                   n_cols,
-                                   CUBLAS_OP_N,
-                                   CUBLAS_OP_T,
-                                   alpha,
-                                   beta,
-                                   mainStream);
-                multAbDone.wait(mainStream);
-                // w <- covA Ab == Q invS Q* A b == inv(A* A) A b
-                raft::linalg::gemv(handle, covA, n_cols, n_cols, Ab, w, false, mainStream);
-            }
+template <typename math_t>
+void lstsqEig(const raft::handle_t& handle,
+              const math_t* A,
+              const int n_rows,
+              const int n_cols,
+              const math_t* b,
+              math_t* w,
+              cudaStream_t stream)
+{
+  rmm::cuda_stream_view mainStream   = rmm::cuda_stream_view(stream);
+  rmm::cuda_stream_view multAbStream = mainStream;
+  bool concurrent                    = false;
+  {
+    int sp_size = handle.get_stream_pool_size();
+    if (sp_size > 0) {
+      multAbStream = handle.get_stream_from_stream_pool(0);
+      // check if the two streams can run concurrently
+      if (!are_implicitly_synchronized(mainStream, multAbStream)) {
+        concurrent = true;
+      } else if (sp_size > 1) {
+        mainStream   = multAbStream;
+        multAbStream = handle.get_stream_from_stream_pool(1);
+        concurrent   = true;
+      }
+    }
+  }
+  // the event is created only if the given raft handle is capable of running
+  // at least two CUDA streams without implicit synchronization.
+  DeviceEvent multAbDone(concurrent);
+
+  rmm::device_uvector<math_t> workset(n_cols * n_cols * 3 + n_cols * 2, mainStream);
+  math_t* Q    = workset.data();
+  math_t* QS   = Q + n_cols * n_cols;
+  math_t* covA = QS + n_cols * n_cols;
+  math_t* S    = covA + n_cols * n_cols;
+  math_t* Ab   = S + n_cols;
+
+  // covA <- A* A
+  math_t alpha = math_t(1);
+  math_t beta  = math_t(0);
+  raft::linalg::gemm(handle,
+                     A,
+                     n_rows,
+                     n_cols,
+                     A,
+                     covA,
+                     n_cols,
+                     n_cols,
+                     CUBLAS_OP_T,
+                     CUBLAS_OP_N,
+                     alpha,
+                     beta,
+                     mainStream);
+
+  // Ab <- A* b
+  raft::linalg::gemv(handle, A, n_rows, n_cols, b, Ab, true, multAbStream);
+  multAbDone.record(multAbStream);
+
+  // Q S Q* <- covA
+  raft::common::nvtx::push_range("raft::linalg::eigDC");
+  raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream);
+  raft::common::nvtx::pop_range();
+
+  // QS  <- Q invS
+  raft::linalg::matrixVectorOp(
+    QS, Q, S, n_cols, n_cols, false, true, DivideByNonZero<math_t>(), mainStream);
+  // covA <- QS Q* == Q invS Q* == inv(A* A)
+  raft::linalg::gemm(handle,
+                     QS,
+                     n_cols,
+                     n_cols,
+                     Q,
+                     covA,
+                     n_cols,
+                     n_cols,
+                     CUBLAS_OP_N,
+                     CUBLAS_OP_T,
+                     alpha,
+                     beta,
+                     mainStream);
+  multAbDone.wait(mainStream);
+  // w <- covA Ab == Q invS Q* A b == inv(A* A) A b
+  raft::linalg::gemv(handle, covA, n_cols, n_cols, Ab, w, false, mainStream);
+}
 
 /** Solves the linear ordinary least squares problem `Aw = b`
  *  via QR decomposition of `A = QR`.
@@ -340,105 +349,106 @@ namespace raft {
  * @param b[in/out] - input target vector.
  *            Warning: the content of this vector is modified by the cuSOLVER routines.
  */
-            template<typename math_t>
-            void lstsqQR(const raft::handle_t &handle,
-                         math_t *A,
-                         const int n_rows,
-                         const int n_cols,
-                         math_t *b,
-                         math_t *w,
-                         cudaStream_t stream) {
-                cublasHandle_t cublasH = handle.get_cublas_handle();
-                cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-
-                int m = n_rows;
-                int n = n_cols;
-
-                int info = 0;
-                rmm::device_uvector <math_t> d_tau(n, stream);
-                rmm::device_scalar<int> d_info(stream);
-
-                const cublasSideMode_t side = CUBLAS_SIDE_LEFT;
-                const cublasOperation_t trans = CUBLAS_OP_T;
-
-                int lwork_geqrf = 0;
-                int lwork_ormqr = 0;
-                int lwork = 0;
-
-                const int lda = m;
-                const int ldb = m;
-
-                // #TODO: Call from public API when ready
-                RAFT_CUSOLVER_TRY(
-                        raft::linalg::detail::cusolverDngeqrf_bufferSize(cusolverH, m, n, A, lda, &lwork_geqrf));
-
-                // #TODO: Call from public API when ready
-                RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnormqr_bufferSize(cusolverH,
-                                                                                   side,
-                                                                                   trans,
-                                                                                   m,
-                                                                                   1,
-                                                                                   n,
-                                                                                   A,
-                                                                                   lda,
-                                                                                   d_tau.data(),
-                                                                                   b,    // C,
-                                                                                   lda,  // ldc,
-                                                                                   &lwork_ormqr));
-
-                lwork = (lwork_geqrf > lwork_ormqr) ? lwork_geqrf : lwork_ormqr;
-
-                rmm::device_uvector <math_t> d_work(lwork, stream);
-
-                // #TODO: Call from public API when ready
-                RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngeqrf(
-                        cusolverH, m, n, A, lda, d_tau.data(), d_work.data(), lwork, d_info.data(), stream));
-
-                RAFT_CUDA_TRY(cudaMemcpyAsync(&info, d_info.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
-                RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-                ASSERT(0 == info, "lstsq.h: QR wasn't successful");
-
-                // #TODO: Call from public API when ready
-                RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnormqr(cusolverH,
-                                                                        side,
-                                                                        trans,
-                                                                        m,
-                                                                        1,
-                                                                        n,
-                                                                        A,
-                                                                        lda,
-                                                                        d_tau.data(),
-                                                                        b,
-                                                                        ldb,
-                                                                        d_work.data(),
-                                                                        lwork,
-                                                                        d_info.data(),
-                                                                        stream));
-
-                RAFT_CUDA_TRY(cudaMemcpyAsync(&info, d_info.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
-                RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-                ASSERT(0 == info, "lstsq.h: QR wasn't successful");
-
-                const math_t one = 1;
-
-                // #TODO: Call from public API when ready
-                RAFT_CUBLAS_TRY(raft::linalg::detail::cublastrsm(cublasH,
-                                                                 side,
-                                                                 CUBLAS_FILL_MODE_UPPER,
-                                                                 CUBLAS_OP_N,
-                                                                 CUBLAS_DIAG_NON_UNIT,
-                                                                 n,
-                                                                 1,
-                                                                 &one,
-                                                                 A,
-                                                                 lda,
-                                                                 b,
-                                                                 ldb,
-                                                                 stream));
-
-                RAFT_CUDA_TRY(cudaMemcpyAsync(w, b, sizeof(math_t) * n, cudaMemcpyDeviceToDevice, stream));
-            }
-
-        };  // namespace detail
-    };  // namespace linalg
+template <typename math_t>
+void lstsqQR(const raft::handle_t& handle,
+             math_t* A,
+             const int n_rows,
+             const int n_cols,
+             math_t* b,
+             math_t* w,
+             cudaStream_t stream)
+{
+  cublasHandle_t cublasH       = handle.get_cublas_handle();
+  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+
+  int m = n_rows;
+  int n = n_cols;
+
+  int info = 0;
+  rmm::device_uvector<math_t> d_tau(n, stream);
+  rmm::device_scalar<int> d_info(stream);
+
+  const cublasSideMode_t side   = CUBLAS_SIDE_LEFT;
+  const cublasOperation_t trans = CUBLAS_OP_T;
+
+  int lwork_geqrf = 0;
+  int lwork_ormqr = 0;
+  int lwork       = 0;
+
+  const int lda = m;
+  const int ldb = m;
+
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(
+    raft::linalg::detail::cusolverDngeqrf_bufferSize(cusolverH, m, n, A, lda, &lwork_geqrf));
+
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnormqr_bufferSize(cusolverH,
+                                                                     side,
+                                                                     trans,
+                                                                     m,
+                                                                     1,
+                                                                     n,
+                                                                     A,
+                                                                     lda,
+                                                                     d_tau.data(),
+                                                                     b,    // C,
+                                                                     lda,  // ldc,
+                                                                     &lwork_ormqr));
+
+  lwork = (lwork_geqrf > lwork_ormqr) ? lwork_geqrf : lwork_ormqr;
+
+  rmm::device_uvector<math_t> d_work(lwork, stream);
+
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngeqrf(
+    cusolverH, m, n, A, lda, d_tau.data(), d_work.data(), lwork, d_info.data(), stream));
+
+  RAFT_CUDA_TRY(cudaMemcpyAsync(&info, d_info.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  ASSERT(0 == info, "lstsq.h: QR wasn't successful");
+
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnormqr(cusolverH,
+                                                          side,
+                                                          trans,
+                                                          m,
+                                                          1,
+                                                          n,
+                                                          A,
+                                                          lda,
+                                                          d_tau.data(),
+                                                          b,
+                                                          ldb,
+                                                          d_work.data(),
+                                                          lwork,
+                                                          d_info.data(),
+                                                          stream));
+
+  RAFT_CUDA_TRY(cudaMemcpyAsync(&info, d_info.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  ASSERT(0 == info, "lstsq.h: QR wasn't successful");
+
+  const math_t one = 1;
+
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublastrsm(cublasH,
+                                                   side,
+                                                   CUBLAS_FILL_MODE_UPPER,
+                                                   CUBLAS_OP_N,
+                                                   CUBLAS_DIAG_NON_UNIT,
+                                                   n,
+                                                   1,
+                                                   &one,
+                                                   A,
+                                                   lda,
+                                                   b,
+                                                   ldb,
+                                                   stream));
+
+  RAFT_CUDA_TRY(cudaMemcpyAsync(w, b, sizeof(math_t) * n, cudaMemcpyDeviceToDevice, stream));
+}
+
+};  // namespace detail
+};  // namespace linalg
 };  // namespace raft
diff --git a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh
index 307ed30c57..54cf9aa204 100644
--- a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh
+++ b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh
@@ -22,25 +22,26 @@
 #include <stdlib.h>
 
 namespace raft {
-    namespace linalg {
-        namespace detail {
+namespace linalg {
+namespace detail {
 
 ///@todo: support col-major
 ///@todo: specialize this to support shared-mem based atomics
 
-        template<typename T, typename KeyIteratorT, typename IdxType>
-        __global__ void reduce_cols_by_key_kernel(
-                const T *data, const KeyIteratorT keys, T *out, IdxType nrows, IdxType ncols, IdxType nkeys) {
-            typedef typename std::iterator_traits<KeyIteratorT>::value_type KeyType;
+template <typename T, typename KeyIteratorT, typename IdxType>
+__global__ void reduce_cols_by_key_kernel(
+  const T* data, const KeyIteratorT keys, T* out, IdxType nrows, IdxType ncols, IdxType nkeys)
+{
+  typedef typename std::iterator_traits<KeyIteratorT>::value_type KeyType;
 
-            IdxType idx = blockIdx.x * blockDim.x + threadIdx.x;
-            if (idx >= (nrows * ncols)) return;
-            ///@todo: yikes! use fast-int-div
-            IdxType colId = idx % ncols;
-            IdxType rowId = idx / ncols;
-            KeyType key = keys[colId];
-            raft::myAtomicAdd(out + rowId * nkeys + key, data[idx]);
-        }
+  IdxType idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= (nrows * ncols)) return;
+  ///@todo: yikes! use fast-int-div
+  IdxType colId = idx % ncols;
+  IdxType rowId = idx / ncols;
+  KeyType key   = keys[colId];
+  raft::myAtomicAdd(out + rowId * nkeys + key, data[idx]);
+}
 
 /**
  * @brief Computes the sum-reduction of matrix columns for each given key
@@ -60,22 +61,23 @@ namespace raft {
  * @param nkeys number of unique keys in the keys array
  * @param stream cuda stream to launch the kernel onto
  */
-        template<typename T, typename KeyIteratorT, typename IdxType = int>
-        void reduce_cols_by_key(const T *data,
-                                const KeyIteratorT keys,
-                                T *out,
-                                IdxType nrows,
-                                IdxType ncols,
-                                IdxType nkeys,
-                                cudaStream_t stream) {
-            typedef typename std::iterator_traits<KeyIteratorT>::value_type KeyType;
+template <typename T, typename KeyIteratorT, typename IdxType = int>
+void reduce_cols_by_key(const T* data,
+                        const KeyIteratorT keys,
+                        T* out,
+                        IdxType nrows,
+                        IdxType ncols,
+                        IdxType nkeys,
+                        cudaStream_t stream)
+{
+  typedef typename std::iterator_traits<KeyIteratorT>::value_type KeyType;
 
-            RAFT_CUDA_TRY(cudaMemsetAsync(out, 0, sizeof(T) * nrows * nkeys, stream));
-            constexpr int TPB = 256;
-            int nblks = (int) raft::ceildiv<IdxType>(nrows * ncols, TPB);
-            reduce_cols_by_key_kernel<<<nblks, TPB, 0, stream>>>(data, keys, out, nrows, ncols, nkeys);
-            RAFT_CUDA_TRY(cudaPeekAtLastError());
-        }
-    };  // end namespace detail
-    };  // end namespace linalg
+  RAFT_CUDA_TRY(cudaMemsetAsync(out, 0, sizeof(T) * nrows * nkeys, stream));
+  constexpr int TPB = 256;
+  int nblks         = (int)raft::ceildiv<IdxType>(nrows * ncols, TPB);
+  reduce_cols_by_key_kernel<<<nblks, TPB, 0, stream>>>(data, keys, out, nrows, ncols, nkeys);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+};  // end namespace detail
+};  // end namespace linalg
 };  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
index f4fa892472..c88895807d 100644
--- a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
@@ -24,53 +24,56 @@
 
 #define MAX_BLOCKS 65535u
 namespace raft {
-    namespace linalg {
-        namespace detail {
+namespace linalg {
+namespace detail {
 
 //
 // Small helper function to convert from int->char and char->int
 // Transform ncols*nrows read of int in 2*nrows reads of int + ncols*rows reads of chars
 //
 
-            template<typename IteratorT1, typename IteratorT2>
-            void __global__ convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n) {
-                for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; idx += gridDim.x * blockDim.x) {
-                    dst[idx] = src[idx];
-                }
-            }
+template <typename IteratorT1, typename IteratorT2>
+void __global__ convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n)
+{
+  for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; idx += gridDim.x * blockDim.x) {
+    dst[idx] = src[idx];
+  }
+}
 
-            template<typename IteratorT1, typename IteratorT2>
-            void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st) {
-                dim3 grid, block;
-                block.x = 256;
+template <typename IteratorT1, typename IteratorT2>
+void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st)
+{
+  dim3 grid, block;
+  block.x = 256;
 
-                grid.x = raft::ceildiv(n, (int) block.x);
-                grid.x = std::min(grid.x, MAX_BLOCKS);
+  grid.x = raft::ceildiv(n, (int)block.x);
+  grid.x = std::min(grid.x, MAX_BLOCKS);
 
-                convert_array_kernel<<<grid, block, 0, st>>>(dst, src, n);
-            }
+  convert_array_kernel<<<grid, block, 0, st>>>(dst, src, n);
+}
 
-            template<typename T>
-            struct quad {
-                T x, y, z, w;
-            };
+template <typename T>
+struct quad {
+  T x, y, z, w;
+};
 
 //
 // Functor for reduce by key, small k
 //
-            template<typename T>
-            struct quadSum {
-                __host__ __device__ __forceinline__ quad<T> operator()(const quad<T> &a, const quad<T> &b) const {
-                    // wasting a double4..
-                    quad<T> c;
-                    c.x = a.x + b.x;
-                    c.y = a.y + b.y;
-                    c.z = a.z + b.z;
-                    c.w = a.w + b.w;
-
-                    return c;
-                }
-            };
+template <typename T>
+struct quadSum {
+  __host__ __device__ __forceinline__ quad<T> operator()(const quad<T>& a, const quad<T>& b) const
+  {
+    // wasting a double4..
+    quad<T> c;
+    c.x = a.x + b.x;
+    c.y = a.y + b.y;
+    c.z = a.z + b.z;
+    c.w = a.w + b.w;
+
+    return c;
+  }
+};
 
 //
 // Reduce by keys
@@ -84,92 +87,93 @@ namespace raft {
 
 #define SUM_ROWS_SMALL_K_DIMX         256
 #define SUM_ROWS_BY_KEY_SMALL_K_MAX_K 4
-            template<typename DataIteratorT, typename WeightT>
-            __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4)
-
-            __global__
-            void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A,
-                                                    int lda,
-                                                    const char *d_keys,
-                                                    const WeightT *d_weights,
-                                                    int nrows,
-                                                    int ncols,
-                                                    int nkeys,
-                                                    DataIteratorT d_sums) {
-                typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
-                typedef cub::BlockReduce<quad<DataType>, SUM_ROWS_SMALL_K_DIMX> BlockReduce;
-                __shared__ typename BlockReduce::TempStorage temp_storage;
-
-                for (int idim = static_cast<int>(blockIdx.y); idim < ncols; idim += gridDim.y) {
-                    if (idim != static_cast<int>(blockIdx.y)) __syncthreads();  // we're reusing temp_storage
-
-                    // threadIdx.x stores partial sum for current dim and key=threadIdx.x in this reg
-                    quad<DataType> thread_sums;
-                    thread_sums.x = 0.0;
-                    thread_sums.y = 0.0;
-                    thread_sums.z = 0.0;
-                    thread_sums.w = 0.0;
-
-                    // May use vectorized load - not necessary for doubles
-                    for (int block_offset_irow = blockIdx.x * blockDim.x;
-                         block_offset_irow < nrows;  // we will syncthreads() inside the loop, no CTA divergence
-                         block_offset_irow += blockDim.x * gridDim.x) {
-                        int irow = block_offset_irow + threadIdx.x;
-                        DataType val = (irow < nrows) ? d_A[irow * lda + idim] : 0.0;
-                        if (d_weights && irow < nrows) { val = val * d_weights[irow]; }
-                        // we are not reusing the keys - after profiling
-                        // d_keys is mainly loaded from L2, and this kernel is DRAM BW bounded
-                        // (experimentation gave a 10% speed up - not worth the many code lines added)
-                        int row_key = (irow < nrows) ? d_keys[irow] : -1;
-
-                        thread_sums.x += (row_key == 0) ? val : 0.0;
-                        thread_sums.y += (row_key == 1) ? val : 0.0;
-                        thread_sums.z += (row_key == 2) ? val : 0.0;
-                        thread_sums.w += (row_key == 3) ? val : 0.0;
-                    }
-
-                    // End of column
-                    // Saving local sums back to global mem
-
-                    // Strided access
-
-                    // Reducing by key
-                    thread_sums = BlockReduce(temp_storage).Reduce(thread_sums, quadSum<DataType>());
-
-                    if (threadIdx.x < 32) {
-                        // We only need 4
-                        thread_sums = cub::ShuffleIndex<32>(thread_sums, 0, 0xffffffff);
-                        if (static_cast<int>(threadIdx.x) < nkeys) {
-                            if (threadIdx.x == 0) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.x);
-                            if (threadIdx.x == 1) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.y);
-                            if (threadIdx.x == 2) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.z);
-                            if (threadIdx.x == 3) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.w);
-                        }
-                    }
-                }
-            }
-
-            template<typename DataIteratorT, typename WeightT>
-            void sum_rows_by_key_small_nkeys(const DataIteratorT d_A,
-                                             int lda,
-                                             const char *d_keys,
-                                             const WeightT *d_weights,
-                                             int nrows,
-                                             int ncols,
-                                             int nkeys,
-                                             DataIteratorT d_sums,
-                                             cudaStream_t st) {
-                dim3 grid, block;
-                block.x = SUM_ROWS_SMALL_K_DIMX;
-                block.y = 1;  // Necessary
-
-                grid.x = raft::ceildiv(nrows, (int) block.x);
-                grid.x = std::min(grid.x, 32u);
-                grid.y = ncols;
-                grid.y = std::min(grid.y, MAX_BLOCKS);
-                sum_rows_by_key_small_nkeys_kernel<<<grid, block, 0, st>>>(
-                        d_A, lda, d_keys, d_weights, nrows, ncols, nkeys, d_sums);
-            }
+template <typename DataIteratorT, typename WeightT>
+__launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4)
+
+  __global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A,
+                                                     int lda,
+                                                     const char* d_keys,
+                                                     const WeightT* d_weights,
+                                                     int nrows,
+                                                     int ncols,
+                                                     int nkeys,
+                                                     DataIteratorT d_sums)
+{
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+  typedef cub::BlockReduce<quad<DataType>, SUM_ROWS_SMALL_K_DIMX> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  for (int idim = static_cast<int>(blockIdx.y); idim < ncols; idim += gridDim.y) {
+    if (idim != static_cast<int>(blockIdx.y)) __syncthreads();  // we're reusing temp_storage
+
+    // per-thread partial sums for the current dim: keys 0..3 are kept in x, y, z, w
+    quad<DataType> thread_sums;
+    thread_sums.x = 0.0;
+    thread_sums.y = 0.0;
+    thread_sums.z = 0.0;
+    thread_sums.w = 0.0;
+
+    // May use vectorized load - not necessary for doubles
+    for (int block_offset_irow = blockIdx.x * blockDim.x;
+         block_offset_irow < nrows;  // we will syncthreads() inside the loop, no CTA divergence
+         block_offset_irow += blockDim.x * gridDim.x) {
+      int irow     = block_offset_irow + threadIdx.x;
+      DataType val = (irow < nrows) ? d_A[irow * lda + idim] : 0.0;
+      if (d_weights && irow < nrows) { val = val * d_weights[irow]; }
+      // We deliberately do not reuse the keys: profiling showed that d_keys is
+      // mainly loaded from L2, while this kernel is DRAM-bandwidth bound
+      // (experimentation gave a 10% speed-up - not worth the many lines of code added)
+      int row_key = (irow < nrows) ? d_keys[irow] : -1;
+
+      thread_sums.x += (row_key == 0) ? val : 0.0;
+      thread_sums.y += (row_key == 1) ? val : 0.0;
+      thread_sums.z += (row_key == 2) ? val : 0.0;
+      thread_sums.w += (row_key == 3) ? val : 0.0;
+    }
+
+    // End of column
+    // Saving local sums back to global mem
+
+    // Strided access
+
+    // Reducing by key
+    thread_sums = BlockReduce(temp_storage).Reduce(thread_sums, quadSum<DataType>());
+
+    if (threadIdx.x < 32) {
+      // We only need 4
+      thread_sums = cub::ShuffleIndex<32>(thread_sums, 0, 0xffffffff);
+      if (static_cast<int>(threadIdx.x) < nkeys) {
+        if (threadIdx.x == 0) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.x);
+        if (threadIdx.x == 1) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.y);
+        if (threadIdx.x == 2) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.z);
+        if (threadIdx.x == 3) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.w);
+      }
+    }
+  }
+}
+
+template <typename DataIteratorT, typename WeightT>
+void sum_rows_by_key_small_nkeys(const DataIteratorT d_A,
+                                 int lda,
+                                 const char* d_keys,
+                                 const WeightT* d_weights,
+                                 int nrows,
+                                 int ncols,
+                                 int nkeys,
+                                 DataIteratorT d_sums,
+                                 cudaStream_t st)
+{
+  dim3 grid, block;
+  block.x = SUM_ROWS_SMALL_K_DIMX;
+  block.y = 1;  // Necessary
+
+  grid.x = raft::ceildiv(nrows, (int)block.x);
+  grid.x = std::min(grid.x, 32u);
+  grid.y = ncols;
+  grid.y = std::min(grid.y, MAX_BLOCKS);
+  sum_rows_by_key_small_nkeys_kernel<<<grid, block, 0, st>>>(
+    d_A, lda, d_keys, d_weights, nrows, ncols, nkeys, d_sums);
+}
 
 //
 // Reduce by keys - large number of keys
@@ -179,160 +183,163 @@ namespace raft {
 
 #define SUM_ROWS_BY_KEY_LARGE_K_MAX_K 1024
 
-            template<typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-            __global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A,
-                                                                        int lda,
-                                                                        const KeysIteratorT d_keys,
-                                                                        const WeightT *d_weights,
-                                                                        int nrows,
-                                                                        int ncols,
-                                                                        int key_offset,
-                                                                        int nkeys,
-                                                                        DataIteratorT d_sums) {
-                typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
-                typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
-                __shared__ DataType local_sums[SUM_ROWS_BY_KEY_LARGE_K_MAX_K];
-
-                for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x)
-                    local_sums[local_key] = 0.0;
-
-                for (int idim = blockIdx.y; idim < ncols; idim += gridDim.y) {
-                    __syncthreads();  // local_sums
-
-                    // At this point local_sums if full of zeros
-
-                    for (int irow = blockIdx.x * blockDim.x + threadIdx.x; irow < nrows;
-                         irow += blockDim.x * gridDim.x) {
-                        // Branch div in this loop - not an issue with current code
-                        DataType val = d_A[idim * lda + irow];
-                        if (d_weights) val = val * d_weights[irow];
-
-                        int local_key = d_keys[irow] - key_offset;
-
-                        // We could load next val here
-                        raft::myAtomicAdd(&local_sums[local_key], val);
-                    }
-
-                    __syncthreads();  // local_sums
-
-                    for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) {
-                        DataType local_sum = local_sums[local_key];
-
-                        if (local_sum != 0.0) {
-                            KeyType global_key = key_offset + local_key;
-                            raft::myAtomicAdd(&d_sums[global_key * ncols + idim], local_sum);
-                            local_sums[local_key] = 0.0;
-                        }
-                    }
-                }
-            }
-
-            template<typename DataIteratorT, typename KeysIteratorT>
-            void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT d_A,
-                                                      int lda,
-                                                      KeysIteratorT d_keys,
-                                                      int nrows,
-                                                      int ncols,
-                                                      int key_offset,
-                                                      int nkeys,
-                                                      DataIteratorT d_sums,
-                                                      cudaStream_t st) {
-                dim3 grid, block;
-                block.x = SUM_ROWS_SMALL_K_DIMX;
-                block.y = 1;  // Necessary
-
-                grid.x = raft::ceildiv(nrows, (int) block.x);
-                grid.x = std::min(grid.x, 32u);
-                grid.y = ncols;
-                grid.y = std::min(grid.y, MAX_BLOCKS);
-                sum_rows_by_key_large_nkeys_kernel_colmajor<<<grid, block, 0, st>>>(
-                        d_A, lda, d_keys, nrows, ncols, key_offset, nkeys, d_sums);
-            }
+template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
+__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A,
+                                                            int lda,
+                                                            const KeysIteratorT d_keys,
+                                                            const WeightT* d_weights,
+                                                            int nrows,
+                                                            int ncols,
+                                                            int key_offset,
+                                                            int nkeys,
+                                                            DataIteratorT d_sums)
+{
+  typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+  __shared__ DataType local_sums[SUM_ROWS_BY_KEY_LARGE_K_MAX_K];
+
+  for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x)
+    local_sums[local_key] = 0.0;
+
+  for (int idim = blockIdx.y; idim < ncols; idim += gridDim.y) {
+    __syncthreads();  // local_sums
+
+    // At this point local_sums is full of zeros
+
+    for (int irow = blockIdx.x * blockDim.x + threadIdx.x; irow < nrows;
+         irow += blockDim.x * gridDim.x) {
+      // Branch divergence in this loop - not an issue with current code
+      DataType val = d_A[idim * lda + irow];
+      if (d_weights) val = val * d_weights[irow];
+
+      int local_key = d_keys[irow] - key_offset;
+
+      // We could load next val here
+      raft::myAtomicAdd(&local_sums[local_key], val);
+    }
+
+    __syncthreads();  // local_sums
+
+    for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) {
+      DataType local_sum = local_sums[local_key];
+
+      if (local_sum != 0.0) {
+        KeyType global_key = key_offset + local_key;
+        raft::myAtomicAdd(&d_sums[global_key * ncols + idim], local_sum);
+        local_sums[local_key] = 0.0;
+      }
+    }
+  }
+}
+
+template <typename DataIteratorT, typename KeysIteratorT>
+void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT d_A,
+                                          int lda,
+                                          KeysIteratorT d_keys,
+                                          int nrows,
+                                          int ncols,
+                                          int key_offset,
+                                          int nkeys,
+                                          DataIteratorT d_sums,
+                                          cudaStream_t st)
+{
+  dim3 grid, block;
+  block.x = SUM_ROWS_SMALL_K_DIMX;
+  block.y = 1;  // Necessary
+
+  grid.x = raft::ceildiv(nrows, (int)block.x);
+  grid.x = std::min(grid.x, 32u);
+  grid.y = ncols;
+  grid.y = std::min(grid.y, MAX_BLOCKS);
+  sum_rows_by_key_large_nkeys_kernel_colmajor<<<grid, block, 0, st>>>(
+    d_A, lda, d_keys, nrows, ncols, key_offset, nkeys, d_sums);
+}
 
 #define RRBK_SHMEM_SZ 32
 
 //#define RRBK_SHMEM
-            template<typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-            __global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A,
-                                                                        int lda,
-                                                                        const WeightT *d_weights,
-                                                                        KeysIteratorT d_keys,
-                                                                        int nrows,
-                                                                        int ncols,
-                                                                        int key_offset,
-                                                                        int nkeys,
-                                                                        DataIteratorT d_sums) {
-                typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
-                typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
+__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A,
+                                                            int lda,
+                                                            const WeightT* d_weights,
+                                                            KeysIteratorT d_keys,
+                                                            int nrows,
+                                                            int ncols,
+                                                            int key_offset,
+                                                            int nkeys,
+                                                            DataIteratorT d_sums)
+{
+  typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
 
 #ifdef RRBK_SHMEM
-                __shared__ KeyType sh_keys[RRBK_SHMEM_SZ];
+  __shared__ KeyType sh_keys[RRBK_SHMEM_SZ];
 #endif
-                int rows_per_partition = nrows / gridDim.z + 1;
-                int start_row = blockIdx.z * rows_per_partition;
-                int end_row = start_row + rows_per_partition;
-                end_row = end_row > nrows ? nrows : end_row;
-
-                KeyType local_key = blockIdx.y;
-                if (local_key >= nkeys) return;
-                int this_col = threadIdx.x + blockIdx.x * blockDim.x;
-                if (this_col >= ncols) return;
-
-                DataType sum = 0.0;
-                KeyType global_key = key_offset + local_key;
+  int rows_per_partition = nrows / gridDim.z + 1;
+  int start_row          = blockIdx.z * rows_per_partition;
+  int end_row            = start_row + rows_per_partition;
+  end_row                = end_row > nrows ? nrows : end_row;
+
+  KeyType local_key = blockIdx.y;
+  if (local_key >= nkeys) return;
+  int this_col = threadIdx.x + blockIdx.x * blockDim.x;
+  if (this_col >= ncols) return;
+
+  DataType sum       = 0.0;
+  KeyType global_key = key_offset + local_key;
 #ifdef RRBK_SHMEM
-                int sh_key_inx = 0;
+  int sh_key_inx = 0;
 #endif
-                for (int r = start_row; r < end_row; r++) {
+  for (int r = start_row; r < end_row; r++) {
 #ifdef RRBK_SHMEM
-                    if (0 == sh_key_inx % RRBK_SHMEM_SZ) {
-          for (int x = threadIdx.x; x < RRBK_SHMEM_SZ; x += blockDim.x)
-            sh_keys[x] = d_keys[r + x];
-          __syncthreads();
-        }
-        if (sh_keys[sh_key_inx] != global_key) continue;  // No divergence since global_key is the
-        // same for the whole block
-        sh_key_inx++;
+    if (0 == sh_key_inx % RRBK_SHMEM_SZ) {
+      for (int x = threadIdx.x; x < RRBK_SHMEM_SZ; x += blockDim.x)
+        sh_keys[x] = d_keys[r + x];
+      __syncthreads();
+    }
+    if (sh_keys[sh_key_inx] != global_key) continue;  // No divergence since global_key is the
+    // same for the whole block
+    sh_key_inx++;
 #else
-                    if (d_keys[r] != global_key)
-                        continue;  // No divergence since global_key is the
-                    // same for the whole block
+    if (d_keys[r] != global_key) continue;  // No divergence since global_key is the
+                                            // same for the whole block
 #endif
-                    // if ((end_row-start_row) / (r-start_row) != global_key) continue;
-                    DataType val = __ldcg(&d_A[r * lda + this_col]);
-                    if (d_weights) { val = val * d_weights[r]; }
-                    sum += val;
-                }
-
-                if (sum != 0.0) raft::myAtomicAdd(&d_sums[global_key * ncols + this_col], sum);
-            }
-
-            template<typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-            void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT d_A,
-                                                      int lda,
-                                                      const KeysIteratorT d_keys,
-                                                      const WeightT *d_weights,
-                                                      int nrows,
-                                                      int ncols,
-                                                      int key_offset,
-                                                      int nkeys,
-                                                      DataIteratorT d_sums,
-                                                      cudaStream_t st) {
-                // x-dim refers to the column in the input data
-                // y-dim refers to the key
-                // z-dim refers to a partitioning of the rows among the threadblocks
-                dim3 grid, block;
-                block.x = 256;  // Adjust me!
-                block.y = 1;    // Don't adjust me!
-                grid.x = raft::ceildiv(ncols, (int) block.x);
-                grid.y = nkeys;
-                grid.z = std::max(40960000 / nkeys / ncols, (int) 1);  // Adjust me!
-                grid.z = std::min(grid.z, (unsigned int) nrows);
-                grid.z = std::min(grid.z, MAX_BLOCKS);
-
-                sum_rows_by_key_large_nkeys_kernel_rowmajor<<<grid, block, 0, st>>>(
-                        d_A, lda, d_weights, d_keys, nrows, ncols, key_offset, nkeys, d_sums);
-            }
+    // if ((end_row-start_row) / (r-start_row) != global_key) continue;
+    DataType val = __ldcg(&d_A[r * lda + this_col]);
+    if (d_weights) { val = val * d_weights[r]; }
+    sum += val;
+  }
+
+  if (sum != 0.0) raft::myAtomicAdd(&d_sums[global_key * ncols + this_col], sum);
+}
+
+template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
+void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT d_A,
+                                          int lda,
+                                          const KeysIteratorT d_keys,
+                                          const WeightT* d_weights,
+                                          int nrows,
+                                          int ncols,
+                                          int key_offset,
+                                          int nkeys,
+                                          DataIteratorT d_sums,
+                                          cudaStream_t st)
+{
+  // x-dim refers to the column in the input data
+  // y-dim refers to the key
+  // z-dim refers to a partitioning of the rows among the threadblocks
+  dim3 grid, block;
+  block.x = 256;  // Adjust me!
+  block.y = 1;    // Don't adjust me!
+  grid.x  = raft::ceildiv(ncols, (int)block.x);
+  grid.y  = nkeys;
+  grid.z  = std::max(40960000 / nkeys / ncols, (int)1);  // Adjust me!
+  grid.z  = std::min(grid.z, (unsigned int)nrows);
+  grid.z  = std::min(grid.z, MAX_BLOCKS);
+
+  sum_rows_by_key_large_nkeys_kernel_rowmajor<<<grid, block, 0, st>>>(
+    d_A, lda, d_weights, d_keys, nrows, ncols, key_offset, nkeys, d_sums);
+}
 
 /**
  * @brief Computes the weighted reduction of matrix rows for each given key
@@ -353,39 +360,40 @@ namespace raft {
  * @param[out] d_sums      Row sums by key (ncols x d_keys)
  * @param[in]  stream      CUDA stream
  */
-            template<typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-            void reduce_rows_by_key(const DataIteratorT d_A,
-                                    int lda,
-                                    const KeysIteratorT d_keys,
-                                    const WeightT *d_weights,
-                                    char *d_keys_char,
-                                    int nrows,
-                                    int ncols,
-                                    int nkeys,
-                                    DataIteratorT d_sums,
-                                    cudaStream_t stream) {
-                typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
-                typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
-
-                // Following kernel needs memset
-                cudaMemsetAsync(d_sums, 0, ncols * nkeys * sizeof(DataType), stream);
-
-                if (nkeys <= SUM_ROWS_BY_KEY_SMALL_K_MAX_K) {
-                    // sum_rows_by_key_small_k is BW bounded. d_keys is loaded ncols time - avoiding wasting BW
-                    // with doubles we have ~20% speed up - with floats we can hope something around 2x
-                    // Converting d_keys to char
-                    convert_array(d_keys_char, d_keys, nrows, stream);
-                    sum_rows_by_key_small_nkeys(
-                            d_A, lda, d_keys_char, d_weights, nrows, ncols, nkeys, d_sums, stream);
-                } else {
-                    for (KeyType key_offset = 0; key_offset < static_cast<KeyType>(nkeys);
-                         key_offset += SUM_ROWS_BY_KEY_LARGE_K_MAX_K) {
-                        KeyType this_call_nkeys = std::min(SUM_ROWS_BY_KEY_LARGE_K_MAX_K, nkeys);
-                        sum_rows_by_key_large_nkeys_rowmajor(
-                                d_A, lda, d_keys, d_weights, nrows, ncols, key_offset, this_call_nkeys, d_sums, stream);
-                    }
-                }
-            }
+template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
+void reduce_rows_by_key(const DataIteratorT d_A,
+                        int lda,
+                        const KeysIteratorT d_keys,
+                        const WeightT* d_weights,
+                        char* d_keys_char,
+                        int nrows,
+                        int ncols,
+                        int nkeys,
+                        DataIteratorT d_sums,
+                        cudaStream_t stream)
+{
+  typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+
+  // The kernels below accumulate into d_sums, so it must start out zeroed.
+  RAFT_CUDA_TRY(cudaMemsetAsync(d_sums, 0, ncols * nkeys * sizeof(DataType), stream));
+
+  if (nkeys <= SUM_ROWS_BY_KEY_SMALL_K_MAX_K) {
+    // sum_rows_by_key_small_k is BW bounded and d_keys is loaded ncols times, so convert d_keys
+    // to char first (~20% speed up with doubles - with floats we can hope for around 2x).
+    convert_array(d_keys_char, d_keys, nrows, stream);
+    sum_rows_by_key_small_nkeys(
+      d_A, lda, d_keys_char, d_weights, nrows, ncols, nkeys, d_sums, stream);
+  } else {
+    for (KeyType key_offset = 0; key_offset < static_cast<KeyType>(nkeys);
+         key_offset += SUM_ROWS_BY_KEY_LARGE_K_MAX_K) {
+      // Clamp the final batch so that no blocks are launched for keys >= nkeys.
+      const int this_call_nkeys = std::min<int>(SUM_ROWS_BY_KEY_LARGE_K_MAX_K, nkeys - key_offset);
+      sum_rows_by_key_large_nkeys_rowmajor(
+        d_A, lda, d_keys, d_weights, nrows, ncols, key_offset, this_call_nkeys, d_sums, stream);
+    }
+  }
+}
 
 /**
  * @brief Computes the reduction of matrix rows for each given key
@@ -403,29 +411,30 @@ namespace raft {
  * @param[out] d_sums      Row sums by key (ncols x d_keys)
  * @param[in]  stream      CUDA stream
  */
-            template<typename DataIteratorT, typename KeysIteratorT>
-            void reduce_rows_by_key(const DataIteratorT d_A,
-                                    int lda,
-                                    const KeysIteratorT d_keys,
-                                    char *d_keys_char,
-                                    int nrows,
-                                    int ncols,
-                                    int nkeys,
-                                    DataIteratorT d_sums,
-                                    cudaStream_t stream) {
-                typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
-                reduce_rows_by_key(d_A,
-                                   lda,
-                                   d_keys,
-                                   static_cast<DataType *>(nullptr),
-                                   d_keys_char,
-                                   nrows,
-                                   ncols,
-                                   nkeys,
-                                   d_sums,
-                                   stream);
-            }
-
-        };  // end namespace detail
-    };  // end namespace linalg
+template <typename DataIteratorT, typename KeysIteratorT>
+void reduce_rows_by_key(const DataIteratorT d_A,
+                        int lda,
+                        const KeysIteratorT d_keys,
+                        char* d_keys_char,
+                        int nrows,
+                        int ncols,
+                        int nkeys,
+                        DataIteratorT d_sums,
+                        cudaStream_t stream)
+{
+  // Unweighted overload: forwards to the weighted overload with a null weight pointer.
+  // The null pointer is typed as a pointer to the data element type so that the weighted
+  // overload's WeightT template parameter can be deduced from it.
+  // NOTE(review): assumes every kernel reached from the weighted overload tolerates null weights.
+  using ValueT = typename std::iterator_traits<DataIteratorT>::value_type;
+
+  ValueT* const no_weights = nullptr;
+
+  // All remaining arguments are passed through unchanged.
+  reduce_rows_by_key(
+    d_A, lda, d_keys, no_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream);
+}
+
+};  // end namespace detail
+};  // end namespace linalg
 };  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/rsvd.cuh b/cpp/include/raft/linalg/detail/rsvd.cuh
index 700ce43735..88436eda64 100644
--- a/cpp/include/raft/linalg/detail/rsvd.cuh
+++ b/cpp/include/raft/linalg/detail/rsvd.cuh
@@ -27,8 +27,8 @@
 #include <raft/random/rng.hpp>
 
 namespace raft {
-    namespace linalg {
-        namespace detail {
+namespace linalg {
+namespace detail {
 
 /**
  * @brief randomized singular value decomposition (RSVD) on the column major
@@ -51,302 +51,301 @@ namespace raft {
  * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers
  * @param stream cuda stream
  */
-            template<typename math_t>
-            void rsvdFixedRank(const raft::handle_t &handle,
-                               math_t *M,
-                               int n_rows,
-                               int n_cols,
-                               math_t *S_vec,
-                               math_t *U,
-                               math_t *V,
-                               int k,
-                               int p,
-                               bool use_bbt,
-                               bool gen_left_vec,
-                               bool gen_right_vec,
-                               bool use_jacobi,
-                               math_t tol,
-                               int max_sweeps,
-                               cudaStream_t stream) {
-                cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-                cublasHandle_t cublasH = handle.get_cublas_handle();
+template <typename math_t>
+void rsvdFixedRank(const raft::handle_t& handle,
+                   math_t* M,
+                   int n_rows,
+                   int n_cols,
+                   math_t* S_vec,
+                   math_t* U,
+                   math_t* V,
+                   int k,
+                   int p,
+                   bool use_bbt,
+                   bool gen_left_vec,
+                   bool gen_right_vec,
+                   bool use_jacobi,
+                   math_t tol,
+                   int max_sweeps,
+                   cudaStream_t stream)
+{
+  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+  cublasHandle_t cublasH       = handle.get_cublas_handle();
 
-                // All the notations are following Algorithm 4 & 5 in S. Voronin's paper:
-                // https://arxiv.org/abs/1502.05366
+  // All the notations are following Algorithm 4 & 5 in S. Voronin's paper:
+  // https://arxiv.org/abs/1502.05366
 
-                int m = n_rows, n = n_cols;
-                int l = k + p;  // Total number of singular values to be computed before truncation
-                int q = 2;      // Number of power sampling counts
-                int s = 1;      // Frequency controller for QR decomposition during power sampling
-                // scheme. s = 1: 2 QR per iteration; s = 2: 1 QR per iteration; s
-                // > 2: less frequent QR
+  int m = n_rows, n = n_cols;
+  int l = k + p;  // Total number of singular values to be computed before truncation
+  int q = 2;      // Number of power sampling counts
+  int s = 1;      // Frequency controller for QR decomposition during power sampling
+  // scheme. s = 1: 2 QR per iteration; s = 2: 1 QR per iteration; s
+  // > 2: less frequent QR
 
-                const math_t alpha = 1.0, beta = 0.0;
+  const math_t alpha = 1.0, beta = 0.0;
 
-                // Build temporary U, S, V matrices
-                rmm::device_uvector <math_t> S_vec_tmp(l, stream);
-                RAFT_CUDA_TRY(cudaMemsetAsync(S_vec_tmp.data(), 0, sizeof(math_t) * l, stream));
+  // Build temporary U, S, V matrices
+  rmm::device_uvector<math_t> S_vec_tmp(l, stream);
+  RAFT_CUDA_TRY(cudaMemsetAsync(S_vec_tmp.data(), 0, sizeof(math_t) * l, stream));
 
-                // build random matrix
-                rmm::device_uvector <math_t> RN(n * l, stream);
-                raft::random::Rng rng(484);
-                rng.normal(RN.data(), n * l, math_t(0.0), alpha, stream);
+  // build random matrix
+  rmm::device_uvector<math_t> RN(n * l, stream);
+  raft::random::Rng rng(484);
+  rng.normal(RN.data(), n * l, math_t(0.0), alpha, stream);
 
-                // multiply to get matrix of random samples Y
-                rmm::device_uvector <math_t> Y(m * l, stream);
-                raft::linalg::gemm(
-                        handle, M, m, n, RN.data(), Y.data(), m, l, CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream);
+  // multiply to get matrix of random samples Y
+  rmm::device_uvector<math_t> Y(m * l, stream);
+  raft::linalg::gemm(
+    handle, M, m, n, RN.data(), Y.data(), m, l, CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream);
 
-                // now build up (M M^T)^q R
-                rmm::device_uvector <math_t> Z(n * l, stream);
-                rmm::device_uvector <math_t> Yorth(m * l, stream);
-                rmm::device_uvector <math_t> Zorth(n * l, stream);
-                RAFT_CUDA_TRY(cudaMemsetAsync(Z.data(), 0, sizeof(math_t) * n * l, stream));
-                RAFT_CUDA_TRY(cudaMemsetAsync(Yorth.data(), 0, sizeof(math_t) * m * l, stream));
-                RAFT_CUDA_TRY(cudaMemsetAsync(Zorth.data(), 0, sizeof(math_t) * n * l, stream));
+  // now build up (M M^T)^q R
+  rmm::device_uvector<math_t> Z(n * l, stream);
+  rmm::device_uvector<math_t> Yorth(m * l, stream);
+  rmm::device_uvector<math_t> Zorth(n * l, stream);
+  RAFT_CUDA_TRY(cudaMemsetAsync(Z.data(), 0, sizeof(math_t) * n * l, stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(Yorth.data(), 0, sizeof(math_t) * m * l, stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(Zorth.data(), 0, sizeof(math_t) * n * l, stream));
 
-                // power sampling scheme
-                for (int j = 1; j < q; j++) {
-                    if ((2 * j - 2) % s == 0) {
-                        raft::linalg::qrGetQ(handle, Y.data(), Yorth.data(), m, l, stream);
-                        raft::linalg::gemm(handle,
-                                           M,
-                                           m,
-                                           n,
-                                           Yorth.data(),
-                                           Z.data(),
-                                           n,
-                                           l,
-                                           CUBLAS_OP_T,
-                                           CUBLAS_OP_N,
-                                           alpha,
-                                           beta,
-                                           stream);
-                    } else {
-                        raft::linalg::gemm(
-                                handle, M, m, n, Y.data(), Z.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta,
-                                stream);
-                    }
+  // power sampling scheme
+  for (int j = 1; j < q; j++) {
+    if ((2 * j - 2) % s == 0) {
+      raft::linalg::qrGetQ(handle, Y.data(), Yorth.data(), m, l, stream);
+      raft::linalg::gemm(handle,
+                         M,
+                         m,
+                         n,
+                         Yorth.data(),
+                         Z.data(),
+                         n,
+                         l,
+                         CUBLAS_OP_T,
+                         CUBLAS_OP_N,
+                         alpha,
+                         beta,
+                         stream);
+    } else {
+      raft::linalg::gemm(
+        handle, M, m, n, Y.data(), Z.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream);
+    }
 
-                    if ((2 * j - 1) % s == 0) {
-                        raft::linalg::qrGetQ(handle, Z.data(), Zorth.data(), n, l, stream);
-                        raft::linalg::gemm(handle,
-                                           M,
-                                           m,
-                                           n,
-                                           Zorth.data(),
-                                           Y.data(),
-                                           m,
-                                           l,
-                                           CUBLAS_OP_N,
-                                           CUBLAS_OP_N,
-                                           alpha,
-                                           beta,
-                                           stream);
-                    } else {
-                        raft::linalg::gemm(
-                                handle, M, m, n, Z.data(), Y.data(), m, l, CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta,
-                                stream);
-                    }
-                }
+    if ((2 * j - 1) % s == 0) {
+      raft::linalg::qrGetQ(handle, Z.data(), Zorth.data(), n, l, stream);
+      raft::linalg::gemm(handle,
+                         M,
+                         m,
+                         n,
+                         Zorth.data(),
+                         Y.data(),
+                         m,
+                         l,
+                         CUBLAS_OP_N,
+                         CUBLAS_OP_N,
+                         alpha,
+                         beta,
+                         stream);
+    } else {
+      raft::linalg::gemm(
+        handle, M, m, n, Z.data(), Y.data(), m, l, CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream);
+    }
+  }
 
-                // orthogonalize on exit from loop to get Q
-                rmm::device_uvector <math_t> Q(m * l, stream);
-                RAFT_CUDA_TRY(cudaMemsetAsync(Q.data(), 0, sizeof(math_t) * m * l, stream));
-                raft::linalg::qrGetQ(handle, Y.data(), Q.data(), m, l, stream);
+  // orthogonalize on exit from loop to get Q
+  rmm::device_uvector<math_t> Q(m * l, stream);
+  RAFT_CUDA_TRY(cudaMemsetAsync(Q.data(), 0, sizeof(math_t) * m * l, stream));
+  raft::linalg::qrGetQ(handle, Y.data(), Q.data(), m, l, stream);
 
-                // either QR of B^T method, or eigendecompose BB^T method
-                if (!use_bbt) {
-                    // form Bt = Mt*Q : nxm * mxl = nxl
-                    rmm::device_uvector <math_t> Bt(n * l, stream);
-                    RAFT_CUDA_TRY(cudaMemsetAsync(Bt.data(), 0, sizeof(math_t) * n * l, stream));
-                    raft::linalg::gemm(
-                            handle, M, m, n, Q.data(), Bt.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream);
+  // either QR of B^T method, or eigendecompose BB^T method
+  if (!use_bbt) {
+    // form Bt = Mt*Q : nxm * mxl = nxl
+    rmm::device_uvector<math_t> Bt(n * l, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(Bt.data(), 0, sizeof(math_t) * n * l, stream));
+    raft::linalg::gemm(
+      handle, M, m, n, Q.data(), Bt.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream);
 
-                    // compute QR factorization of Bt
-                    // M is mxn ; Q is mxn ; R is min(m,n) x min(m,n) */
-                    rmm::device_uvector <math_t> Qhat(n * l, stream);
-                    RAFT_CUDA_TRY(cudaMemsetAsync(Qhat.data(), 0, sizeof(math_t) * n * l, stream));
-                    rmm::device_uvector <math_t> Rhat(l * l, stream);
-                    RAFT_CUDA_TRY(cudaMemsetAsync(Rhat.data(), 0, sizeof(math_t) * l * l, stream));
-                    raft::linalg::qrGetQR(handle, Bt.data(), Qhat.data(), Rhat.data(), n, l, stream);
+    // compute QR factorization of Bt
+    // QR input is mxn ; Q is mxn ; R is min(m,n) x min(m,n) (here: Bt nxl, Qhat nxl, Rhat lxl)
+    rmm::device_uvector<math_t> Qhat(n * l, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(Qhat.data(), 0, sizeof(math_t) * n * l, stream));
+    rmm::device_uvector<math_t> Rhat(l * l, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(Rhat.data(), 0, sizeof(math_t) * l * l, stream));
+    raft::linalg::qrGetQR(handle, Bt.data(), Qhat.data(), Rhat.data(), n, l, stream);
 
-                    // compute SVD of Rhat (lxl)
-                    rmm::device_uvector <math_t> Uhat(l * l, stream);
-                    RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream));
-                    rmm::device_uvector <math_t> Vhat(l * l, stream);
-                    RAFT_CUDA_TRY(cudaMemsetAsync(Vhat.data(), 0, sizeof(math_t) * l * l, stream));
-                    if (use_jacobi)
-                        raft::linalg::svdJacobi(handle,
-                                                Rhat.data(),
-                                                l,
-                                                l,
-                                                S_vec_tmp.data(),
-                                                Uhat.data(),
-                                                Vhat.data(),
-                                                true,
-                                                true,
-                                                tol,
-                                                max_sweeps,
-                                                stream);
-                    else
-                        raft::linalg::svdQR(handle,
-                                            Rhat.data(),
-                                            l,
-                                            l,
-                                            S_vec_tmp.data(),
-                                            Uhat.data(),
-                                            Vhat.data(),
-                                            true,
-                                            true,
-                                            true,
-                                            stream);
-                    raft::matrix::sliceMatrix(S_vec_tmp.data(),
-                                              1,
-                                              l,
-                                              S_vec,
-                                              0,
-                                              0,
-                                              1,
-                                              k,
-                                              stream);  // First k elements of S_vec
+    // compute SVD of Rhat (lxl)
+    rmm::device_uvector<math_t> Uhat(l * l, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream));
+    rmm::device_uvector<math_t> Vhat(l * l, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(Vhat.data(), 0, sizeof(math_t) * l * l, stream));
+    if (use_jacobi)
+      raft::linalg::svdJacobi(handle,
+                              Rhat.data(),
+                              l,
+                              l,
+                              S_vec_tmp.data(),
+                              Uhat.data(),
+                              Vhat.data(),
+                              true,
+                              true,
+                              tol,
+                              max_sweeps,
+                              stream);
+    else
+      raft::linalg::svdQR(handle,
+                          Rhat.data(),
+                          l,
+                          l,
+                          S_vec_tmp.data(),
+                          Uhat.data(),
+                          Vhat.data(),
+                          true,
+                          true,
+                          true,
+                          stream);
+    raft::matrix::sliceMatrix(S_vec_tmp.data(),
+                              1,
+                              l,
+                              S_vec,
+                              0,
+                              0,
+                              1,
+                              k,
+                              stream);  // First k elements of S_vec
 
-                    // Merge step 14 & 15 by calculating U = Q*Vhat[:,1:k] mxl * lxk = mxk
-                    if (gen_left_vec) {
-                        raft::linalg::gemm(handle,
-                                           Q.data(),
-                                           m,
-                                           l,
-                                           Vhat.data(),
-                                           U,
-                                           m,
-                                           k /*used to be l and needs slicing*/,
-                                           CUBLAS_OP_N,
-                                           CUBLAS_OP_N,
-                                           alpha,
-                                           beta,
-                                           stream);
-                    }
+    // Merge step 14 & 15 by calculating U = Q*Vhat[:,1:k] mxl * lxk = mxk
+    if (gen_left_vec) {
+      raft::linalg::gemm(handle,
+                         Q.data(),
+                         m,
+                         l,
+                         Vhat.data(),
+                         U,
+                         m,
+                         k /*used to be l and needs slicing*/,
+                         CUBLAS_OP_N,
+                         CUBLAS_OP_N,
+                         alpha,
+                         beta,
+                         stream);
+    }
 
-                    // Merge step 14 & 15 by calculating V = Qhat*Uhat[:,1:k] nxl * lxk = nxk
-                    if (gen_right_vec) {
-                        raft::linalg::gemm(handle,
-                                           Qhat.data(),
-                                           n,
-                                           l,
-                                           Uhat.data(),
-                                           V,
-                                           n,
-                                           k /*used to be l and needs slicing*/,
-                                           CUBLAS_OP_N,
-                                           CUBLAS_OP_N,
-                                           alpha,
-                                           beta,
-                                           stream);
-                    }
-                } else {
-                    // build the matrix B B^T = Q^T M M^T Q column by column
-                    // Bt = M^T Q ; nxm * mxk = nxk
-                    rmm::device_uvector <math_t> B(n * l, stream);
-                    raft::linalg::gemm(
-                            handle, Q.data(), m, l, M, B.data(), l, n, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream);
+    // Merge step 14 & 15 by calculating V = Qhat*Uhat[:,1:k] nxl * lxk = nxk
+    if (gen_right_vec) {
+      raft::linalg::gemm(handle,
+                         Qhat.data(),
+                         n,
+                         l,
+                         Uhat.data(),
+                         V,
+                         n,
+                         k /*used to be l and needs slicing*/,
+                         CUBLAS_OP_N,
+                         CUBLAS_OP_N,
+                         alpha,
+                         beta,
+                         stream);
+    }
+  } else {
+    // build the matrix B B^T = Q^T M M^T Q column by column
+    // B = Q^T M ; lxm * mxn = lxn
+    rmm::device_uvector<math_t> B(n * l, stream);
+    raft::linalg::gemm(
+      handle, Q.data(), m, l, M, B.data(), l, n, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream);
 
-                    rmm::device_uvector <math_t> BBt(l * l, stream);
-                    raft::linalg::gemm(handle,
-                                       B.data(),
-                                       l,
-                                       n,
-                                       B.data(),
-                                       BBt.data(),
-                                       l,
-                                       l,
-                                       CUBLAS_OP_N,
-                                       CUBLAS_OP_T,
-                                       alpha,
-                                       beta,
-                                       stream);
+    rmm::device_uvector<math_t> BBt(l * l, stream);
+    raft::linalg::gemm(handle,
+                       B.data(),
+                       l,
+                       n,
+                       B.data(),
+                       BBt.data(),
+                       l,
+                       l,
+                       CUBLAS_OP_N,
+                       CUBLAS_OP_T,
+                       alpha,
+                       beta,
+                       stream);
 
-                    // compute eigendecomposition of BBt
-                    rmm::device_uvector <math_t> Uhat(l * l, stream);
-                    RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream));
-                    rmm::device_uvector <math_t> Uhat_dup(l * l, stream);
-                    RAFT_CUDA_TRY(cudaMemsetAsync(Uhat_dup.data(), 0, sizeof(math_t) * l * l, stream));
-                    raft::matrix::copyUpperTriangular(BBt.data(), Uhat_dup.data(), l, l, stream);
-                    if (use_jacobi)
-                        raft::linalg::eigJacobi(
-                                handle, Uhat_dup.data(), l, l, Uhat.data(), S_vec_tmp.data(), stream, tol, max_sweeps);
-                    else
-                        raft::linalg::eigDC(handle, Uhat_dup.data(), l, l, Uhat.data(), S_vec_tmp.data(), stream);
-                    raft::matrix::seqRoot(S_vec_tmp.data(), l, stream);
-                    raft::matrix::sliceMatrix(S_vec_tmp.data(),
-                                              1,
-                                              l,
-                                              S_vec,
-                                              0,
-                                              p,
-                                              1,
-                                              l,
-                                              stream);  // Last k elements of S_vec
-                    raft::matrix::colReverse(S_vec, 1, k, stream);
+    // compute eigendecomposition of BBt
+    rmm::device_uvector<math_t> Uhat(l * l, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream));
+    rmm::device_uvector<math_t> Uhat_dup(l * l, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(Uhat_dup.data(), 0, sizeof(math_t) * l * l, stream));
+    raft::matrix::copyUpperTriangular(BBt.data(), Uhat_dup.data(), l, l, stream);
+    if (use_jacobi)
+      raft::linalg::eigJacobi(
+        handle, Uhat_dup.data(), l, l, Uhat.data(), S_vec_tmp.data(), stream, tol, max_sweeps);
+    else
+      raft::linalg::eigDC(handle, Uhat_dup.data(), l, l, Uhat.data(), S_vec_tmp.data(), stream);
+    raft::matrix::seqRoot(S_vec_tmp.data(), l, stream);
+    raft::matrix::sliceMatrix(S_vec_tmp.data(),
+                              1,
+                              l,
+                              S_vec,
+                              0,
+                              p,
+                              1,
+                              l,
+                              stream);  // Last k elements of S_vec
+    raft::matrix::colReverse(S_vec, 1, k, stream);
 
-                    // Merge step 14 & 15 by calculating U = Q*Uhat[:,(p+1):l] mxl * lxk = mxk
-                    if (gen_left_vec) {
-                        raft::linalg::gemm(handle,
-                                           Q.data(),
-                                           m,
-                                           l,
-                                           Uhat.data() + p * l,
-                                           U,
-                                           m,
-                                           k,
-                                           CUBLAS_OP_N,
-                                           CUBLAS_OP_N,
-                                           alpha,
-                                           beta,
-                                           stream);
-                        raft::matrix::colReverse(U, m, k, stream);
-                    }
+    // Merge step 14 & 15 by calculating U = Q*Uhat[:,(p+1):l] mxl * lxk = mxk
+    if (gen_left_vec) {
+      raft::linalg::gemm(handle,
+                         Q.data(),
+                         m,
+                         l,
+                         Uhat.data() + p * l,
+                         U,
+                         m,
+                         k,
+                         CUBLAS_OP_N,
+                         CUBLAS_OP_N,
+                         alpha,
+                         beta,
+                         stream);
+      raft::matrix::colReverse(U, m, k, stream);
+    }
 
-                    // Merge step 14 & 15 by calculating V = B^T Uhat[:,(p+1):l] *
-                    // Sigma^{-1}[(p+1):l, (p+1):l] nxl * lxk * kxk = nxk
-                    if (gen_right_vec) {
-                        rmm::device_uvector <math_t> Sinv(k * k, stream);
-                        RAFT_CUDA_TRY(cudaMemsetAsync(Sinv.data(), 0, sizeof(math_t) * k * k, stream));
-                        rmm::device_uvector <math_t> UhatSinv(l * k, stream);
-                        RAFT_CUDA_TRY(cudaMemsetAsync(UhatSinv.data(), 0, sizeof(math_t) * l * k, stream));
-                        raft::matrix::reciprocal(S_vec_tmp.data(), l, stream);
-                        raft::matrix::initializeDiagonalMatrix(S_vec_tmp.data() + p, Sinv.data(), k, k, stream);
+    // Merge step 14 & 15 by calculating V = B^T Uhat[:,(p+1):l] *
+    // Sigma^{-1}[(p+1):l, (p+1):l] nxl * lxk * kxk = nxk
+    if (gen_right_vec) {
+      rmm::device_uvector<math_t> Sinv(k * k, stream);
+      RAFT_CUDA_TRY(cudaMemsetAsync(Sinv.data(), 0, sizeof(math_t) * k * k, stream));
+      rmm::device_uvector<math_t> UhatSinv(l * k, stream);
+      RAFT_CUDA_TRY(cudaMemsetAsync(UhatSinv.data(), 0, sizeof(math_t) * l * k, stream));
+      raft::matrix::reciprocal(S_vec_tmp.data(), l, stream);
+      raft::matrix::initializeDiagonalMatrix(S_vec_tmp.data() + p, Sinv.data(), k, k, stream);
 
-                        raft::linalg::gemm(handle,
-                                           Uhat.data() + p * l,
-                                           l,
-                                           k,
-                                           Sinv.data(),
-                                           UhatSinv.data(),
-                                           l,
-                                           k,
-                                           CUBLAS_OP_N,
-                                           CUBLAS_OP_N,
-                                           alpha,
-                                           beta,
-                                           stream);
-                        raft::linalg::gemm(handle,
-                                           B.data(),
-                                           l,
-                                           n,
-                                           UhatSinv.data(),
-                                           V,
-                                           n,
-                                           k,
-                                           CUBLAS_OP_T,
-                                           CUBLAS_OP_N,
-                                           alpha,
-                                           beta,
-                                           stream);
-                        raft::matrix::colReverse(V, n, k, stream);
-                    }
-                }
-            }
+      raft::linalg::gemm(handle,
+                         Uhat.data() + p * l,
+                         l,
+                         k,
+                         Sinv.data(),
+                         UhatSinv.data(),
+                         l,
+                         k,
+                         CUBLAS_OP_N,
+                         CUBLAS_OP_N,
+                         alpha,
+                         beta,
+                         stream);
+      raft::linalg::gemm(handle,
+                         B.data(),
+                         l,
+                         n,
+                         UhatSinv.data(),
+                         V,
+                         n,
+                         k,
+                         CUBLAS_OP_T,
+                         CUBLAS_OP_N,
+                         alpha,
+                         beta,
+                         stream);
+      raft::matrix::colReverse(V, n, k, stream);
+    }
+  }
+}
 
 /**
  * @brief randomized singular value decomposition (RSVD) on the column major
@@ -369,44 +368,45 @@ namespace raft {
  * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers
  * @param stream cuda stream
  */
-            template<typename math_t>
-            void rsvdPerc(const raft::handle_t &handle,
-                          math_t *M,
-                          int n_rows,
-                          int n_cols,
-                          math_t *S_vec,
-                          math_t *U,
-                          math_t *V,
-                          math_t PC_perc,
-                          math_t UpS_perc,
-                          bool use_bbt,
-                          bool gen_left_vec,
-                          bool gen_right_vec,
-                          bool use_jacobi,
-                          math_t tol,
-                          int max_sweeps,
-                          cudaStream_t stream) {
-                int k = max((int) (min(n_rows, n_cols) * PC_perc),
-                            1);  // Number of singular values to be computed
-                int p = max((int) (min(n_rows, n_cols) * UpS_perc), 1);  // Upsamples
-                rsvdFixedRank(handle,
-                              M,
-                              n_rows,
-                              n_cols,
-                              S_vec,
-                              U,
-                              V,
-                              k,
-                              p,
-                              use_bbt,
-                              gen_left_vec,
-                              gen_right_vec,
-                              use_jacobi,
-                              tol,
-                              max_sweeps,
-                              stream);
-            }
+template <typename math_t>
+void rsvdPerc(const raft::handle_t& handle,
+              math_t* M,
+              int n_rows,
+              int n_cols,
+              math_t* S_vec,
+              math_t* U,
+              math_t* V,
+              math_t PC_perc,
+              math_t UpS_perc,
+              bool use_bbt,
+              bool gen_left_vec,
+              bool gen_right_vec,
+              bool use_jacobi,
+              math_t tol,
+              int max_sweeps,
+              cudaStream_t stream)
+{
+  int k = max((int)(min(n_rows, n_cols) * PC_perc),
+              1);  // Number of singular values to be computed
+  int p = max((int)(min(n_rows, n_cols) * UpS_perc), 1);  // Upsamples
+  rsvdFixedRank(handle,
+                M,
+                n_rows,
+                n_cols,
+                S_vec,
+                U,
+                V,
+                k,
+                p,
+                use_bbt,
+                gen_left_vec,
+                gen_right_vec,
+                use_jacobi,
+                tol,
+                max_sweeps,
+                stream);
+}
 
-        };  // end namespace detail
-    };  // end namespace linalg
+};  // end namespace detail
+};  // end namespace linalg
 };  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/ternary_op.cuh b/cpp/include/raft/linalg/detail/ternary_op.cuh
index 935ffed190..bcfcc9df01 100644
--- a/cpp/include/raft/linalg/detail/ternary_op.cuh
+++ b/cpp/include/raft/linalg/detail/ternary_op.cuh
@@ -20,39 +20,41 @@
 #include <raft/vectorized.cuh>
 
 namespace raft {
-    namespace linalg {
+namespace linalg {
 namespace detail {
-    template<typename math_t, int veclen_, typename Lambda, typename IdxType>
-    __global__ void ternaryOpKernel(
-            math_t *out, const math_t *in1, const math_t *in2, const math_t *in3, IdxType len, Lambda op) {
-        typedef raft::TxN_t<math_t, veclen_> VecType;
-        VecType a, b, c;
-        IdxType idx = threadIdx.x + ((IdxType) blockIdx.x * blockDim.x);
-        idx *= VecType::Ratio;
-        if (idx >= len) return;
-        a.load(in1, idx);
-        b.load(in2, idx);
-        c.load(in3, idx);
+template <typename math_t, int veclen_, typename Lambda, typename IdxType>
+__global__ void ternaryOpKernel(
+  math_t* out, const math_t* in1, const math_t* in2, const math_t* in3, IdxType len, Lambda op)
+{
+  typedef raft::TxN_t<math_t, veclen_> VecType;
+  VecType a, b, c;
+  IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x);
+  idx *= VecType::Ratio;
+  if (idx >= len) return;
+  a.load(in1, idx);
+  b.load(in2, idx);
+  c.load(in3, idx);
 #pragma unroll
-        for (int i = 0; i < VecType::Ratio; ++i) {
-            a.val.data[i] = op(a.val.data[i], b.val.data[i], c.val.data[i]);
-        }
-        a.store(out, idx);
-    }
+  for (int i = 0; i < VecType::Ratio; ++i) {
+    a.val.data[i] = op(a.val.data[i], b.val.data[i], c.val.data[i]);
+  }
+  a.store(out, idx);
+}
 
-    template<typename math_t, int veclen_, typename Lambda, typename IdxType, int TPB>
-    void ternaryOpImpl(math_t *out,
-                       const math_t *in1,
-                       const math_t *in2,
-                       const math_t *in3,
-                       IdxType len,
-                       Lambda op,
-                       cudaStream_t stream) {
-        const IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : len, (IdxType) TPB);
-        ternaryOpKernel<math_t, veclen_, Lambda, IdxType>
-        <<<nblks, TPB, 0, stream>>>(out, in1, in2, in3, len, op);
-        RAFT_CUDA_TRY(cudaPeekAtLastError());
-    }
+template <typename math_t, int veclen_, typename Lambda, typename IdxType, int TPB>
+void ternaryOpImpl(math_t* out,
+                   const math_t* in1,
+                   const math_t* in2,
+                   const math_t* in3,
+                   IdxType len,
+                   Lambda op,
+                   cudaStream_t stream)
+{
+  const IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : len, (IdxType)TPB);
+  ternaryOpKernel<math_t, veclen_, Lambda, IdxType>
+    <<<nblks, TPB, 0, stream>>>(out, in1, in2, in3, len, op);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
 
 /**
  * @brief perform element-wise ternary operation on the input arrays
@@ -68,35 +70,36 @@ namespace detail {
  * @param op the device-lambda
  * @param stream cuda stream where to launch work
  */
-    template<typename math_t, typename Lambda, typename IdxType = int, int TPB = 256>
-    void ternaryOp(math_t *out,
-                   const math_t *in1,
-                   const math_t *in2,
-                   const math_t *in3,
-                   IdxType len,
-                   Lambda op,
-                   cudaStream_t stream) {
-        size_t bytes = len * sizeof(math_t);
-        if (16 / sizeof(math_t) && bytes % 16 == 0) {
-            ternaryOpImpl<math_t, 16 / sizeof(math_t), Lambda, IdxType, TPB>(
-                    out, in1, in2, in3, len, op, stream);
-        } else if (8 / sizeof(math_t) && bytes % 8 == 0) {
-            ternaryOpImpl<math_t, 8 / sizeof(math_t), Lambda, IdxType, TPB>(
-                    out, in1, in2, in3, len, op, stream);
-        } else if (4 / sizeof(math_t) && bytes % 4 == 0) {
-            ternaryOpImpl<math_t, 4 / sizeof(math_t), Lambda, IdxType, TPB>(
-                    out, in1, in2, in3, len, op, stream);
-        } else if (2 / sizeof(math_t) && bytes % 2 == 0) {
-            ternaryOpImpl<math_t, 2 / sizeof(math_t), Lambda, IdxType, TPB>(
-                    out, in1, in2, in3, len, op, stream);
-        } else if (1 / sizeof(math_t)) {
-            ternaryOpImpl<math_t, 1 / sizeof(math_t), Lambda, IdxType, TPB>(
-                    out, in1, in2, in3, len, op, stream);
-        } else {
-            ternaryOpImpl<math_t, 1, Lambda, IdxType, TPB>(out, in1, in2, in3, len, op, stream);
-        }
-    }
+template <typename math_t, typename Lambda, typename IdxType = int, int TPB = 256>
+void ternaryOp(math_t* out,
+               const math_t* in1,
+               const math_t* in2,
+               const math_t* in3,
+               IdxType len,
+               Lambda op,
+               cudaStream_t stream)
+{
+  size_t bytes = len * sizeof(math_t);
+  if (16 / sizeof(math_t) && bytes % 16 == 0) {
+    ternaryOpImpl<math_t, 16 / sizeof(math_t), Lambda, IdxType, TPB>(
+      out, in1, in2, in3, len, op, stream);
+  } else if (8 / sizeof(math_t) && bytes % 8 == 0) {
+    ternaryOpImpl<math_t, 8 / sizeof(math_t), Lambda, IdxType, TPB>(
+      out, in1, in2, in3, len, op, stream);
+  } else if (4 / sizeof(math_t) && bytes % 4 == 0) {
+    ternaryOpImpl<math_t, 4 / sizeof(math_t), Lambda, IdxType, TPB>(
+      out, in1, in2, in3, len, op, stream);
+  } else if (2 / sizeof(math_t) && bytes % 2 == 0) {
+    ternaryOpImpl<math_t, 2 / sizeof(math_t), Lambda, IdxType, TPB>(
+      out, in1, in2, in3, len, op, stream);
+  } else if (1 / sizeof(math_t)) {
+    ternaryOpImpl<math_t, 1 / sizeof(math_t), Lambda, IdxType, TPB>(
+      out, in1, in2, in3, len, op, stream);
+  } else {
+    ternaryOpImpl<math_t, 1, Lambda, IdxType, TPB>(out, in1, in2, in3, len, op, stream);
+  }
+}
 
 };  // end namespace detail
-    };  // end namespace linalg
+};  // end namespace linalg
 };  // end namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/lstsq.hpp b/cpp/include/raft/linalg/lstsq.hpp
index bfa302eb4f..5540cca3a5 100644
--- a/cpp/include/raft/linalg/lstsq.hpp
+++ b/cpp/include/raft/linalg/lstsq.hpp
@@ -36,7 +36,7 @@ void lstsqSvdQR(const raft::handle_t& handle,
                 math_t* w,
                 cudaStream_t stream)
 {
-    detail::lstsqSvdQR(handle, A, n_rows, n_cols, b, w, stream);
+  detail::lstsqSvdQR(handle, A, n_rows, n_cols, b, w, stream);
 }
 
 /** Solves the linear ordinary least squares problem `Aw = b`
@@ -54,7 +54,7 @@ void lstsqSvdJacobi(const raft::handle_t& handle,
                     math_t* w,
                     cudaStream_t stream)
 {
-    detail::lstsqSvdJacobi(handle, A, n_rows, n_cols, b, w, stream);
+  detail::lstsqSvdJacobi(handle, A, n_rows, n_cols, b, w, stream);
 }
 
 /** Solves the linear ordinary least squares problem `Aw = b`
@@ -70,7 +70,7 @@ void lstsqEig(const raft::handle_t& handle,
               math_t* w,
               cudaStream_t stream)
 {
-    detail::lstsqEig(handle, A, n_rows, n_cols, b, w, stream);
+  detail::lstsqEig(handle, A, n_rows, n_cols, b, w, stream);
 }
 
 /** Solves the linear ordinary least squares problem `Aw = b`
@@ -91,7 +91,7 @@ void lstsqQR(const raft::handle_t& handle,
              math_t* w,
              cudaStream_t stream)
 {
-    detail::lstsqQR(handle, A, n_rows, n_cols, b, w, stream);
+  detail::lstsqQR(handle, A, n_rows, n_cols, b, w, stream);
 }
 
 };  // namespace linalg
diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh
index 1a39d4b3ba..07760f0c5c 100644
--- a/cpp/include/raft/linalg/power.cuh
+++ b/cpp/include/raft/linalg/power.cuh
@@ -20,7 +20,7 @@
 #include <raft/linalg/unary_op.hpp>
 
 namespace raft {
-    namespace linalg {
+namespace linalg {
 
 /**
  * @defgroup ScalarOps Scalar operations on the input buffer
@@ -33,11 +33,11 @@ namespace raft {
  * @param stream cuda stream where to launch work
  * @{
  */
-template<typename math_t, typename IdxType = int>
-void powerScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, cudaStream_t stream) {
-    raft::linalg::unaryOp(
-            out, in, len,[scalar] __device__(math_t
-    in) { return raft::myPow(in, scalar); }, stream);
+template <typename math_t, typename IdxType = int>
+void powerScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
+{
+  raft::linalg::unaryOp(
+    out, in, len, [scalar] __device__(math_t in) { return raft::myPow(in, scalar); }, stream);
 }
 /** @} */
 
@@ -52,12 +52,13 @@ void powerScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, cuda
  * @param stream cuda stream where to launch work
  * @{
  */
-template<typename math_t, typename IdxType = int>
-void power(math_t *out, const math_t *in1, const math_t *in2, IdxType len, cudaStream_t stream) {
-    raft::linalg::binaryOp(
-            out, in1, in2, len, [] __device__(math_t a, math_t b) { return raft::myPow(a, b); }, stream);
+template <typename math_t, typename IdxType = int>
+void power(math_t* out, const math_t* in1, const math_t* in2, IdxType len, cudaStream_t stream)
+{
+  raft::linalg::binaryOp(
+    out, in1, in2, len, [] __device__(math_t a, math_t b) { return raft::myPow(a, b); }, stream);
 }
 /** @} */
 
-    };  // end namespace linalg
+};  // end namespace linalg
 };  // end namespace raft
diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
index c6e163d491..82d272671c 100644
--- a/cpp/include/raft/linalg/reduce_cols_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
@@ -21,7 +21,6 @@
 namespace raft {
 namespace linalg {
 
-
 /**
  * @brief Computes the sum-reduction of matrix columns for each given key
  * @tparam T the input data type (as well as the output reduced matrix)
@@ -40,15 +39,16 @@ namespace linalg {
  * @param nkeys number of unique keys in the keys array
  * @param stream cuda stream to launch the kernel onto
  */
-template<typename T, typename KeyIteratorT, typename IdxType = int>
-void reduce_cols_by_key(const T *data,
+template <typename T, typename KeyIteratorT, typename IdxType = int>
+void reduce_cols_by_key(const T* data,
                         const KeyIteratorT keys,
-                        T *out,
+                        T* out,
                         IdxType nrows,
                         IdxType ncols,
                         IdxType nkeys,
-                        cudaStream_t stream) {
-    detail::reduce_cols_by_key(data, keys, out, nrows, ncols, nkeys, stream);
+                        cudaStream_t stream)
+{
+  detail::reduce_cols_by_key(data, keys, out, nrows, ncols, nkeys, stream);
 }
 };  // end namespace linalg
 };  // end namespace raft
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
index 3b5345a540..986f5e8a7f 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -39,18 +39,20 @@ namespace linalg {
  * @param[out] d_sums      Row sums by key (ncols x d_keys)
  * @param[in]  stream      CUDA stream
  */
-template<typename DataIteratorT, typename KeysIteratorT, typename WeightT>
+template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
 void reduce_rows_by_key(const DataIteratorT d_A,
                         int lda,
                         const KeysIteratorT d_keys,
-                        const WeightT *d_weights,
-                        char *d_keys_char,
+                        const WeightT* d_weights,
+                        char* d_keys_char,
                         int nrows,
                         int ncols,
                         int nkeys,
                         DataIteratorT d_sums,
-                        cudaStream_t stream) {
-    detail::reduce_rows_by_key(d_A, lda, d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream);
+                        cudaStream_t stream)
+{
+  detail::reduce_rows_by_key(
+    d_A, lda, d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream);
 }
 
 /**
@@ -69,29 +71,31 @@ void reduce_rows_by_key(const DataIteratorT d_A,
  * @param[out] d_sums      Row sums by key (ncols x d_keys)
  * @param[in]  stream      CUDA stream
  */
-template<typename DataIteratorT, typename KeysIteratorT>
+template <typename DataIteratorT, typename KeysIteratorT>
 void reduce_rows_by_key(const DataIteratorT d_A,
                         int lda,
                         const KeysIteratorT d_keys,
-                        char *d_keys_char,
+                        char* d_keys_char,
                         int nrows,
                         int ncols,
                         int nkeys,
                         DataIteratorT d_sums,
-                        cudaStream_t stream) {
-    typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
-    reduce_rows_by_key(d_A,
-                       lda,
-                       d_keys,
-                       static_cast<DataType *>(nullptr),
-                       d_keys_char,
-                       nrows,
-                       ncols,
-                       nkeys,
-                       d_sums,
-                       stream);
+                        cudaStream_t stream)
+{
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+  reduce_rows_by_key(d_A,
+                     lda,
+                     d_keys,
+                     static_cast<DataType*>(nullptr),
+                     d_keys_char,
+                     nrows,
+                     ncols,
+                     nkeys,
+                     d_sums,
+                     stream);
 }
 
 };  // end namespace detail
 };  // end namespace linalg
-};  // end namespace raft
+}
+;  // end namespace raft
diff --git a/cpp/include/raft/linalg/rsvd.cuh b/cpp/include/raft/linalg/rsvd.cuh
index e789abce30..d1d739489f 100644
--- a/cpp/include/raft/linalg/rsvd.cuh
+++ b/cpp/include/raft/linalg/rsvd.cuh
@@ -42,14 +42,14 @@ namespace linalg {
  * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers
  * @param stream cuda stream
  */
-template<typename math_t>
-void rsvdFixedRank(const raft::handle_t &handle,
-                   math_t *M,
+template <typename math_t>
+void rsvdFixedRank(const raft::handle_t& handle,
+                   math_t* M,
                    int n_rows,
                    int n_cols,
-                   math_t *S_vec,
-                   math_t *U,
-                   math_t *V,
+                   math_t* S_vec,
+                   math_t* U,
+                   math_t* V,
                    int k,
                    int p,
                    bool use_bbt,
@@ -58,12 +58,26 @@ void rsvdFixedRank(const raft::handle_t &handle,
                    bool use_jacobi,
                    math_t tol,
                    int max_sweeps,
-                   cudaStream_t stream) {
-
-    detail::rsvdFixedRank(handle, M, n_rows, n_cols, S_vec, U, V, k, p, use_bbt, gen_left_vec, gen_right_vec, use_jacobi, tol, max_sweeps, stream);
+                   cudaStream_t stream)
+{
+  detail::rsvdFixedRank(handle,
+                        M,
+                        n_rows,
+                        n_cols,
+                        S_vec,
+                        U,
+                        V,
+                        k,
+                        p,
+                        use_bbt,
+                        gen_left_vec,
+                        gen_right_vec,
+                        use_jacobi,
+                        tol,
+                        max_sweeps,
+                        stream);
 }
 
-
 /**
  * @brief randomized singular value decomposition (RSVD) on the column major
  * float type input matrix (Jacobi-based), by specifying the PC and upsampling
@@ -85,14 +99,14 @@ void rsvdFixedRank(const raft::handle_t &handle,
  * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers
  * @param stream cuda stream
  */
-template<typename math_t>
-void rsvdPerc(const raft::handle_t &handle,
-              math_t *M,
+template <typename math_t>
+void rsvdPerc(const raft::handle_t& handle,
+              math_t* M,
               int n_rows,
               int n_cols,
-              math_t *S_vec,
-              math_t *U,
-              math_t *V,
+              math_t* S_vec,
+              math_t* U,
+              math_t* V,
               math_t PC_perc,
               math_t UpS_perc,
               bool use_bbt,
@@ -101,8 +115,24 @@ void rsvdPerc(const raft::handle_t &handle,
               bool use_jacobi,
               math_t tol,
               int max_sweeps,
-              cudaStream_t stream) {
-    detail::rsvdPerc(handle, M, n_rows, n_cols, S_vec, U, V, PC_perc, UpS_perc, use_bbt, gen_left_vec, gen_right_vec, use_jacobi, tol, max_sweeps, stream);
+              cudaStream_t stream)
+{
+  detail::rsvdPerc(handle,
+                   M,
+                   n_rows,
+                   n_cols,
+                   S_vec,
+                   U,
+                   V,
+                   PC_perc,
+                   UpS_perc,
+                   use_bbt,
+                   gen_left_vec,
+                   gen_right_vec,
+                   use_jacobi,
+                   tol,
+                   max_sweeps,
+                   stream);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh
index 49eb6788ef..c431cfdcc0 100644
--- a/cpp/include/raft/linalg/sqrt.cuh
+++ b/cpp/include/raft/linalg/sqrt.cuh
@@ -35,8 +35,8 @@ namespace linalg {
 template <typename math_t, typename IdxType = int>
 void sqrt(math_t* out, const math_t* in, IdxType len, cudaStream_t stream)
 {
-    raft::linalg::unaryOp(
-            out, in, len, [] __device__(math_t in) { return raft::mySqrt(in); }, stream);
+  raft::linalg::unaryOp(
+    out, in, len, [] __device__(math_t in) { return raft::mySqrt(in); }, stream);
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh
index 99e21fd5a0..be411e6492 100644
--- a/cpp/include/raft/linalg/ternary_op.cuh
+++ b/cpp/include/raft/linalg/ternary_op.cuh
@@ -34,15 +34,16 @@ namespace linalg {
  * @param op the device-lambda
  * @param stream cuda stream where to launch work
  */
-template<typename math_t, typename Lambda, typename IdxType = int, int TPB = 256>
-void ternaryOp(math_t *out,
-               const math_t *in1,
-               const math_t *in2,
-               const math_t *in3,
+template <typename math_t, typename Lambda, typename IdxType = int, int TPB = 256>
+void ternaryOp(math_t* out,
+               const math_t* in1,
+               const math_t* in2,
+               const math_t* in3,
                IdxType len,
                Lambda op,
-               cudaStream_t stream) {
-    detail::ternaryOp(out, in1, in2, in3, len, op, stream);
+               cudaStream_t stream)
+{
+  detail::ternaryOp(out, in1, in2, in3, len, op, stream);
 }
 
 };  // end namespace linalg
diff --git a/cpp/test/linalg/power.cu b/cpp/test/linalg/power.cu
index 8f336d583f..13d9791992 100644
--- a/cpp/test/linalg/power.cu
+++ b/cpp/test/linalg/power.cu
@@ -16,115 +16,115 @@
 
 #include "test_utils.h"
 #include <gtest/gtest.h>
-#include <raft/linalg/power.cuh>
 #include <raft/cudart_utils.h>
+#include <raft/linalg/power.cuh>
 #include <raft/random/rng.hpp>
 
 namespace linalg {
 
-        template <typename Type>
-        __global__ void naivePowerElemKernel(Type* out, const Type* in1, const Type* in2, int len)
-        {
-            int idx = threadIdx.x + blockIdx.x * blockDim.x;
-            if (idx < len) { out[idx] = raft::myPow(in1[idx], in2[idx]); }
-        }
-
-        template <typename Type>
-        void naivePowerElem(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream)
-        {
-            static const int TPB = 64;
-            int nblks            = raft::ceildiv(len, TPB);
-            naivePowerElemKernel<Type><<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
-            RAFT_CUDA_TRY(cudaPeekAtLastError());
-        }
-
-        template <typename Type>
-        __global__ void naivePowerScalarKernel(Type* out, const Type* in1, const Type in2, int len)
-        {
-            int idx = threadIdx.x + blockIdx.x * blockDim.x;
-            if (idx < len) { out[idx] = raft::myPow(in1[idx], in2); }
-        }
-
-        template <typename Type>
-        void naivePowerScalar(Type* out, const Type* in1, const Type in2, int len, cudaStream_t stream)
-        {
-            static const int TPB = 64;
-            int nblks            = raft::ceildiv(len, TPB);
-            naivePowerScalarKernel<Type><<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
-            RAFT_CUDA_TRY(cudaPeekAtLastError());
-        }
-
-        template <typename T>
-        struct PowerInputs {
-            T tolerance;
-            int len;
-            unsigned long long int seed;
-        };
-
-        template <typename T>
-        ::std::ostream& operator<<(::std::ostream& os, const PowerInputs<T>& dims)
-        {
-            return os;
-        }
-
-        template <typename T>
-        class PowerTest : public ::testing::TestWithParam<PowerInputs<T>> {
-        protected:
-            PowerTest() : in1(0, stream), in2(0, stream), out_ref(0, stream), out(0, stream) {}
-
-            void SetUp() override
-            {
-                params = ::testing::TestWithParam<PowerInputs<T>>::GetParam();
-                raft::random::Rng r(params.seed);
-                int len = params.len;
-                RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-                in1.resize(len, stream);
-                in2.resize(len, stream);
-                out_ref.resize(len, stream);
-                out.resize(len, stream);
-                r.uniform(in1.data(), len, T(1.0), T(2.0), stream);
-                r.uniform(in2.data(), len, T(1.0), T(2.0), stream);
-
-                naivePowerElem(out_ref.data(), in1.data(), in2.data(), len, stream);
-                naivePowerScalar(out_ref.data(), out_ref.data(), T(2), len, stream);
-
-                power(out.data(), in1.data(), in2.data(), len, stream);
-                powerScalar(out.data(), out.data(), T(2), len, stream);
-                power(in1.data(), in1.data(), in2.data(), len, stream);
-                powerScalar(in1.data(), in1.data(), T(2), len, stream);
-                RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-            }
-
-        protected:
-            cudaStream_t stream = 0;
-            PowerInputs<T> params;
-            rmm::device_uvector<T> in1, in2, out_ref, out;
-            int device_count = 0;
-        };
-
-        const std::vector<PowerInputs<float>> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}};
-
-        const std::vector<PowerInputs<double>> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}};
-
-        typedef PowerTest<float> PowerTestF;
-        TEST_P(PowerTestF, Result)
-    {
-        ASSERT_TRUE(raft::devArrMatch(
-                out_ref.data(), out.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
-
-        ASSERT_TRUE(raft::devArrMatch(
-                out_ref.data(), in1.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
-    }
-
-    typedef PowerTest<double> PowerTestD;
-    TEST_P(PowerTestD, Result)
+template <typename Type>
+__global__ void naivePowerElemKernel(Type* out, const Type* in1, const Type* in2, int len)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < len) { out[idx] = raft::myPow(in1[idx], in2[idx]); }
+}
+
+template <typename Type>
+void naivePowerElem(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream)
+{
+  static const int TPB = 64;
+  int nblks            = raft::ceildiv(len, TPB);
+  naivePowerElemKernel<Type><<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+template <typename Type>
+__global__ void naivePowerScalarKernel(Type* out, const Type* in1, const Type in2, int len)
+{
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < len) { out[idx] = raft::myPow(in1[idx], in2); }
+}
+
+template <typename Type>
+void naivePowerScalar(Type* out, const Type* in1, const Type in2, int len, cudaStream_t stream)
+{
+  static const int TPB = 64;
+  int nblks            = raft::ceildiv(len, TPB);
+  naivePowerScalarKernel<Type><<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+template <typename T>
+struct PowerInputs {
+  T tolerance;
+  int len;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const PowerInputs<T>& dims)
+{
+  return os;
+}
+
+template <typename T>
+class PowerTest : public ::testing::TestWithParam<PowerInputs<T>> {
+ protected:
+  PowerTest() : in1(0, stream), in2(0, stream), out_ref(0, stream), out(0, stream) {}
+
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<PowerInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int len = params.len;
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+
+    in1.resize(len, stream);
+    in2.resize(len, stream);
+    out_ref.resize(len, stream);
+    out.resize(len, stream);
+    r.uniform(in1.data(), len, T(1.0), T(2.0), stream);
+    r.uniform(in2.data(), len, T(1.0), T(2.0), stream);
+
+    naivePowerElem(out_ref.data(), in1.data(), in2.data(), len, stream);
+    naivePowerScalar(out_ref.data(), out_ref.data(), T(2), len, stream);
+
+    power(out.data(), in1.data(), in2.data(), len, stream);
+    powerScalar(out.data(), out.data(), T(2), len, stream);
+    power(in1.data(), in1.data(), in2.data(), len, stream);
+    powerScalar(in1.data(), in1.data(), T(2), len, stream);
+    RAFT_CUDA_TRY(cudaStreamDestroy(stream));
+  }
+
+ protected:
+  cudaStream_t stream = 0;
+  PowerInputs<T> params;
+  rmm::device_uvector<T> in1, in2, out_ref, out;
+  int device_count = 0;
+};
+
+const std::vector<PowerInputs<float>> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}};
+
+const std::vector<PowerInputs<double>> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}};
+
+typedef PowerTest<float> PowerTestF;
+TEST_P(PowerTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    out_ref.data(), out.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
+
+  ASSERT_TRUE(raft::devArrMatch(
+    out_ref.data(), in1.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
+}
+
+typedef PowerTest<double> PowerTestD;
+TEST_P(PowerTestD, Result)
 {
-    ASSERT_TRUE(raft::devArrMatch(
-            out_ref.data(), out.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    out_ref.data(), out.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
 
-    ASSERT_TRUE(raft::devArrMatch(
-            out_ref.data(), in1.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    out_ref.data(), in1.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
 }
 
 INSTANTIATE_TEST_CASE_P(PowerTests, PowerTestF, ::testing::ValuesIn(inputsf2));
diff --git a/cpp/test/linalg/reduce_cols_by_key.cu b/cpp/test/linalg/reduce_cols_by_key.cu
index 55057b4894..072f586bac 100644
--- a/cpp/test/linalg/reduce_cols_by_key.cu
+++ b/cpp/test/linalg/reduce_cols_by_key.cu
@@ -16,107 +16,107 @@
 
 #include "test_utils.h"
 #include <gtest/gtest.h>
-#include <raft/linalg/reduce_cols_by_key.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/interruptible.hpp>
+#include <raft/linalg/reduce_cols_by_key.cuh>
 #include <raft/random/rng.hpp>
 
 namespace raft {
-    namespace linalg {
+namespace linalg {
 
-        template <typename T>
-        void naiveReduceColsByKey(const T* in,
-                                  const uint32_t* keys,
-                                  T* out_ref,
-                                  uint32_t nrows,
-                                  uint32_t ncols,
-                                  uint32_t nkeys,
-                                  cudaStream_t stream)
-        {
-            std::vector<uint32_t> h_keys(ncols, 0u);
-            raft::copy(&(h_keys[0]), keys, ncols, stream);
-            std::vector<T> h_in(nrows * ncols);
-            raft::copy(&(h_in[0]), in, nrows * ncols, stream);
-            raft::interruptible::synchronize(stream);
-            std::vector<T> out(nrows * nkeys, T(0));
-            for (uint32_t i = 0; i < nrows; ++i) {
-                for (uint32_t j = 0; j < ncols; ++j) {
-                    out[i * nkeys + h_keys[j]] += h_in[i * ncols + j];
-                }
-            }
-            raft::copy(out_ref, &(out[0]), nrows * nkeys, stream);
-            raft::interruptible::synchronize(stream);
-        }
+template <typename T>
+void naiveReduceColsByKey(const T* in,
+                          const uint32_t* keys,
+                          T* out_ref,
+                          uint32_t nrows,
+                          uint32_t ncols,
+                          uint32_t nkeys,
+                          cudaStream_t stream)
+{
+  std::vector<uint32_t> h_keys(ncols, 0u);
+  raft::copy(&(h_keys[0]), keys, ncols, stream);
+  std::vector<T> h_in(nrows * ncols);
+  raft::copy(&(h_in[0]), in, nrows * ncols, stream);
+  raft::interruptible::synchronize(stream);
+  std::vector<T> out(nrows * nkeys, T(0));
+  for (uint32_t i = 0; i < nrows; ++i) {
+    for (uint32_t j = 0; j < ncols; ++j) {
+      out[i * nkeys + h_keys[j]] += h_in[i * ncols + j];
+    }
+  }
+  raft::copy(out_ref, &(out[0]), nrows * nkeys, stream);
+  raft::interruptible::synchronize(stream);
+}
 
-        template <typename T>
-        struct ReduceColsInputs {
-            T tolerance;
-            uint32_t rows;
-            uint32_t cols;
-            uint32_t nkeys;
-            unsigned long long int seed;
-        };
+template <typename T>
+struct ReduceColsInputs {
+  T tolerance;
+  uint32_t rows;
+  uint32_t cols;
+  uint32_t nkeys;
+  unsigned long long int seed;
+};
 
-        template <typename T>
-        ::std::ostream& operator<<(::std::ostream& os, const ReduceColsInputs<T>& dims)
-        {
-            return os;
-        }
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const ReduceColsInputs<T>& dims)
+{
+  return os;
+}
 
-        template <typename T>
-        class ReduceColsTest : public ::testing::TestWithParam<ReduceColsInputs<T>> {
-        protected:
-            ReduceColsTest() : in(0, stream), out_ref(0, stream), out(0, stream), keys(0, stream) {}
+template <typename T>
+class ReduceColsTest : public ::testing::TestWithParam<ReduceColsInputs<T>> {
+ protected:
+  ReduceColsTest() : in(0, stream), out_ref(0, stream), out(0, stream), keys(0, stream) {}
 
-            void SetUp() override
-            {
-                params = ::testing::TestWithParam<ReduceColsInputs<T>>::GetParam();
-                raft::random::Rng r(params.seed);
-                RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-                auto nrows = params.rows;
-                auto ncols = params.cols;
-                auto nkeys = params.nkeys;
-                in.resize(nrows * ncols, stream);
-                keys.resize(ncols, stream);
-                out_ref.resize(nrows * nkeys, stream);
-                out.resize(nrows * nkeys, stream);
-                r.uniform(in.data(), nrows * ncols, T(-1.0), T(1.0), stream);
-                r.uniformInt(keys.data(), ncols, 0u, params.nkeys, stream);
-                naiveReduceColsByKey(in.data(), keys.data(), out_ref.data(), nrows, ncols, nkeys, stream);
-                reduce_cols_by_key(in.data(), keys.data(), out.data(), nrows, ncols, nkeys, stream);
-                raft::interruptible::synchronize(stream);
-            }
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<ReduceColsInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+    auto nrows = params.rows;
+    auto ncols = params.cols;
+    auto nkeys = params.nkeys;
+    in.resize(nrows * ncols, stream);
+    keys.resize(ncols, stream);
+    out_ref.resize(nrows * nkeys, stream);
+    out.resize(nrows * nkeys, stream);
+    r.uniform(in.data(), nrows * ncols, T(-1.0), T(1.0), stream);
+    r.uniformInt(keys.data(), ncols, 0u, params.nkeys, stream);
+    naiveReduceColsByKey(in.data(), keys.data(), out_ref.data(), nrows, ncols, nkeys, stream);
+    reduce_cols_by_key(in.data(), keys.data(), out.data(), nrows, ncols, nkeys, stream);
+    raft::interruptible::synchronize(stream);
+  }
 
-            void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
+  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
 
-        protected:
-            cudaStream_t stream = 0;
-            ReduceColsInputs<T> params;
-            rmm::device_uvector<T> in, out_ref, out;
-            rmm::device_uvector<uint32_t> keys;
-        };
+ protected:
+  cudaStream_t stream = 0;
+  ReduceColsInputs<T> params;
+  rmm::device_uvector<T> in, out_ref, out;
+  rmm::device_uvector<uint32_t> keys;
+};
 
-        const std::vector<ReduceColsInputs<float>> inputsf = {{0.0001f, 128, 32, 6, 1234ULL},
-                                                              {0.0005f, 121, 63, 10, 1234ULL}};
-        typedef ReduceColsTest<float> ReduceColsTestF;
-        TEST_P(ReduceColsTestF, Result)
-    {
-        ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
-                                      out.data(),
-                                      params.rows * params.nkeys,
-                                      raft::CompareApprox<float>(params.tolerance)));
-    }
-    INSTANTIATE_TEST_CASE_P(ReduceColsTests, ReduceColsTestF, ::testing::ValuesIn(inputsf));
+const std::vector<ReduceColsInputs<float>> inputsf = {{0.0001f, 128, 32, 6, 1234ULL},
+                                                      {0.0005f, 121, 63, 10, 1234ULL}};
+typedef ReduceColsTest<float> ReduceColsTestF;
+TEST_P(ReduceColsTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
+                                out.data(),
+                                params.rows * params.nkeys,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(ReduceColsTests, ReduceColsTestF, ::testing::ValuesIn(inputsf));
 
-    const std::vector<ReduceColsInputs<double>> inputsd2 = {{0.0000001, 128, 32, 6, 1234ULL},
-                                                            {0.0000001, 121, 63, 10, 1234ULL}};
-    typedef ReduceColsTest<double> ReduceColsTestD;
-    TEST_P(ReduceColsTestD, Result)
+const std::vector<ReduceColsInputs<double>> inputsd2 = {{0.0000001, 128, 32, 6, 1234ULL},
+                                                        {0.0000001, 121, 63, 10, 1234ULL}};
+typedef ReduceColsTest<double> ReduceColsTestD;
+TEST_P(ReduceColsTestD, Result)
 {
-    ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
-                                  out.data(),
-                                  params.rows * params.nkeys,
-                                  raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
+                                out.data(),
+                                params.rows * params.nkeys,
+                                raft::CompareApprox<double>(params.tolerance)));
 }
 INSTANTIATE_TEST_CASE_P(ReduceColsTests, ReduceColsTestD, ::testing::ValuesIn(inputsd2));
 
diff --git a/cpp/test/linalg/reduce_rows_by_key.cu b/cpp/test/linalg/reduce_rows_by_key.cu
index e6dc8cef7f..1bda427e6f 100644
--- a/cpp/test/linalg/reduce_rows_by_key.cu
+++ b/cpp/test/linalg/reduce_rows_by_key.cu
@@ -17,246 +17,246 @@
 #include "test_utils.h"
 #include <gtest/gtest.h>
 #include <iostream>
-#include <raft/linalg/reduce_rows_by_key.cuh>
 #include <raft/cudart_utils.h>
+#include <raft/linalg/reduce_rows_by_key.cuh>
 #include <raft/random/rng.hpp>
 
 namespace raft {
-    namespace linalg {
+namespace linalg {
 
-        template <typename Type>
-        __global__ void naiveReduceRowsByKeyKernel(const Type* d_A,
-                                                   int lda,
-                                                   uint32_t* d_keys,
-                                                   const Type* d_weight,
-                                                   char* d_char_keys,
-                                                   int nrows,
-                                                   int ncols,
-                                                   int nkeys,
-                                                   Type* d_sums)
-        {
-            int c = threadIdx.x + blockIdx.x * blockDim.x;
-            if (c >= ncols) return;
-            int this_key = threadIdx.y + blockIdx.y * blockDim.y;
+template <typename Type>
+__global__ void naiveReduceRowsByKeyKernel(const Type* d_A,
+                                           int lda,
+                                           uint32_t* d_keys,
+                                           const Type* d_weight,
+                                           char* d_char_keys,
+                                           int nrows,
+                                           int ncols,
+                                           int nkeys,
+                                           Type* d_sums)
+{
+  int c = threadIdx.x + blockIdx.x * blockDim.x;
+  if (c >= ncols) return;
+  int this_key = threadIdx.y + blockIdx.y * blockDim.y;
 
-            Type sum = 0.0;
-            for (int r = 0; r < nrows; r++) {
-                if (this_key != d_keys[r]) continue;
-                Type wt = 1;
-                if (d_weight) wt = d_weight[r];
-                sum += d_A[lda * r + c] * wt;
-            }
-            d_sums[this_key * ncols + c] = sum;
-        }
-        template <typename Type>
-        void naiveReduceRowsByKey(const Type* d_A,
-                                  int lda,
-                                  uint32_t* d_keys,
-                                  const Type* d_weight,
-                                  char* d_char_keys,
-                                  int nrows,
-                                  int ncols,
-                                  int nkeys,
-                                  Type* d_sums,
-                                  cudaStream_t stream)
-        {
-            cudaMemset(d_sums, 0, sizeof(Type) * nkeys * ncols);
+  Type sum = 0.0;
+  for (int r = 0; r < nrows; r++) {
+    if (this_key != d_keys[r]) continue;
+    Type wt = 1;
+    if (d_weight) wt = d_weight[r];
+    sum += d_A[lda * r + c] * wt;
+  }
+  d_sums[this_key * ncols + c] = sum;
+}
+template <typename Type>
+void naiveReduceRowsByKey(const Type* d_A,
+                          int lda,
+                          uint32_t* d_keys,
+                          const Type* d_weight,
+                          char* d_char_keys,
+                          int nrows,
+                          int ncols,
+                          int nkeys,
+                          Type* d_sums,
+                          cudaStream_t stream)  // memset and launch both run on this stream
+{
+  RAFT_CUDA_TRY(cudaMemsetAsync(d_sums, 0, sizeof(Type) * nkeys * ncols, stream));
 
-            naiveReduceRowsByKeyKernel<<<dim3((ncols + 31) / 32, nkeys), dim3(32, 1), 0, stream>>>(
-                    d_A, lda, d_keys, d_weight, d_char_keys, nrows, ncols, nkeys, d_sums);
-        }
+  naiveReduceRowsByKeyKernel<<<dim3((ncols + 31) / 32, nkeys), dim3(32, 1), 0, stream>>>(
+    d_A, lda, d_keys, d_weight, d_char_keys, nrows, ncols, nkeys, d_sums);
+}
 
-        template <typename T>
-        struct ReduceRowsInputs {
-            T tolerance;
-            int nobs;
-            uint32_t cols;
-            uint32_t nkeys;
-            unsigned long long int seed;
-            bool weighted;
-            T max_weight;
-        };
+template <typename T>
+struct ReduceRowsInputs {
+  T tolerance;
+  int nobs;
+  uint32_t cols;
+  uint32_t nkeys;
+  unsigned long long int seed;
+  bool weighted;
+  T max_weight;
+};
 
-        template <typename T>
-        ::std::ostream& operator<<(::std::ostream& os, const ReduceRowsInputs<T>& dims)
-        {
-            return os;
-        }
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const ReduceRowsInputs<T>& dims)
+{
+  return os;
+}
 
-        template <typename T>
-        class ReduceRowTest : public ::testing::TestWithParam<ReduceRowsInputs<T>> {
-        public:
-            ReduceRowTest()
-                    : params(::testing::TestWithParam<ReduceRowsInputs<T>>::GetParam()),
-                      stream(handle.get_stream()),
-                      in(params.nobs * params.cols, stream),
-                      out(params.nkeys * params.cols, stream),
-                      out_ref(params.nkeys * params.cols, stream),
-                      keys(params.nobs, stream),
-                      scratch_buf(params.nobs, stream)
-            {
-            }
+template <typename T>
+class ReduceRowTest : public ::testing::TestWithParam<ReduceRowsInputs<T>> {
+ public:
+  ReduceRowTest()
+    : params(::testing::TestWithParam<ReduceRowsInputs<T>>::GetParam()),
+      stream(handle.get_stream()),
+      in(params.nobs * params.cols, stream),
+      out(params.nkeys * params.cols, stream),
+      out_ref(params.nkeys * params.cols, stream),
+      keys(params.nobs, stream),
+      scratch_buf(params.nobs, stream)
+  {
+  }
 
-        protected:
-            void SetUp() override
-            {
-                raft::random::Rng r(params.seed);
-                raft::random::Rng r_int(params.seed);
+ protected:
+  void SetUp() override
+  {
+    raft::random::Rng r(params.seed);
+    raft::random::Rng r_int(params.seed);
 
-                int nobs       = params.nobs;
-                uint32_t cols  = params.cols;
-                uint32_t nkeys = params.nkeys;
-                r.uniform(in.data(), nobs * cols, T(0.0), T(2.0 / nobs), stream);
-                r_int.uniformInt(keys.data(), nobs, (uint32_t)0, nkeys, stream);
+    int nobs       = params.nobs;
+    uint32_t cols  = params.cols;
+    uint32_t nkeys = params.nkeys;
+    r.uniform(in.data(), nobs * cols, T(0.0), T(2.0 / nobs), stream);
+    r_int.uniformInt(keys.data(), nobs, (uint32_t)0, nkeys, stream);
 
-                rmm::device_uvector<T> weight(0, stream);
-                if (params.weighted) {
-                    weight.resize(nobs, stream);
-                    raft::random::Rng r(params.seed, raft::random::GeneratorType::GenPhilox);
-                    r.uniform(weight.data(), nobs, T(1), params.max_weight, stream);
-                }
+    rmm::device_uvector<T> weight(0, stream);
+    if (params.weighted) {
+      weight.resize(nobs, stream);
+      raft::random::Rng r(params.seed, raft::random::GeneratorType::GenPhilox);
+      r.uniform(weight.data(), nobs, T(1), params.max_weight, stream);
+    }
 
-                naiveReduceRowsByKey(in.data(),
-                                     cols,
-                                     keys.data(),
-                                     params.weighted ? weight.data() : nullptr,
-                                     scratch_buf.data(),
-                                     nobs,
-                                     cols,
-                                     nkeys,
-                                     out_ref.data(),
-                                     stream);
-                if (params.weighted) {
-                    reduce_rows_by_key(in.data(),
-                                       cols,
-                                       keys.data(),
-                                       params.weighted ? weight.data() : nullptr,
-                                       scratch_buf.data(),
-                                       nobs,
-                                       cols,
-                                       nkeys,
-                                       out.data(),
-                                       stream);
-                } else {
-                    reduce_rows_by_key(
-                            in.data(), cols, keys.data(), scratch_buf.data(), nobs, cols, nkeys, out.data(), stream);
-                }
-                handle.sync_stream(stream);
-            }
+    naiveReduceRowsByKey(in.data(),
+                         cols,
+                         keys.data(),
+                         params.weighted ? weight.data() : nullptr,
+                         scratch_buf.data(),
+                         nobs,
+                         cols,
+                         nkeys,
+                         out_ref.data(),
+                         stream);
+    if (params.weighted) {
+      reduce_rows_by_key(in.data(),
+                         cols,
+                         keys.data(),
+                         params.weighted ? weight.data() : nullptr,
+                         scratch_buf.data(),
+                         nobs,
+                         cols,
+                         nkeys,
+                         out.data(),
+                         stream);
+    } else {
+      reduce_rows_by_key(
+        in.data(), cols, keys.data(), scratch_buf.data(), nobs, cols, nkeys, out.data(), stream);
+    }
+    handle.sync_stream(stream);
+  }
 
-        protected:
-            ReduceRowsInputs<T> params;
-            raft::handle_t handle;
-            cudaStream_t stream = 0;
+ protected:
+  ReduceRowsInputs<T> params;
+  raft::handle_t handle;
+  cudaStream_t stream = 0;
 
-            int device_count = 0;
-            rmm::device_uvector<T> in, out, out_ref;
-            rmm::device_uvector<uint32_t> keys;
-            rmm::device_uvector<char> scratch_buf;
-        };
+  int device_count = 0;
+  rmm::device_uvector<T> in, out, out_ref;
+  rmm::device_uvector<uint32_t> keys;
+  rmm::device_uvector<char> scratch_buf;
+};
 
 // ReduceRowTestF
 // 128 Obs, 32 cols, 6 clusters
-        const std::vector<ReduceRowsInputs<float>> inputsf2 = {{0.000001f, 128, 32, 6, 1234ULL, false},
-                                                               {0.000001f, 128, 32, 6, 1234ULL, true, 1.0},
-                                                               {0.000001f, 128, 32, 6, 1234ULL, true, 2.0}};
-        typedef ReduceRowTest<float> ReduceRowTestF;
-        TEST_P(ReduceRowTestF, Result)
-    {
-        ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
-                                      out.data(),
-                                      params.cols * params.nkeys,
-                                      raft::CompareApprox<float>(params.tolerance)));
-    }
-    INSTANTIATE_TEST_CASE_P(ReduceRowTests, ReduceRowTestF, ::testing::ValuesIn(inputsf2));
+const std::vector<ReduceRowsInputs<float>> inputsf2 = {{0.000001f, 128, 32, 6, 1234ULL, false},
+                                                       {0.000001f, 128, 32, 6, 1234ULL, true, 1.0},
+                                                       {0.000001f, 128, 32, 6, 1234ULL, true, 2.0}};
+typedef ReduceRowTest<float> ReduceRowTestF;
+TEST_P(ReduceRowTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
+                                out.data(),
+                                params.cols * params.nkeys,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(ReduceRowTests, ReduceRowTestF, ::testing::ValuesIn(inputsf2));
 
 // ReduceRowTestD
 // 128 Obs, 32 cols, 6 clusters, double precision
-    const std::vector<ReduceRowsInputs<double>> inputsd2 = {
-            {0.00000001, 128, 32, 6, 1234ULL, false},
-            {0.00000001, 128, 32, 6, 1234ULL, true, 2.0},
-            {0.00000001, 128, 32, 6, 1234ULL, true, 8.0}};
-    typedef ReduceRowTest<double> ReduceRowTestD;
-    TEST_P(ReduceRowTestD, Result)
+const std::vector<ReduceRowsInputs<double>> inputsd2 = {
+  {0.00000001, 128, 32, 6, 1234ULL, false},
+  {0.00000001, 128, 32, 6, 1234ULL, true, 2.0},
+  {0.00000001, 128, 32, 6, 1234ULL, true, 8.0}};
+typedef ReduceRowTest<double> ReduceRowTestD;
+TEST_P(ReduceRowTestD, Result)
 {
-    ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
-                                  out.data(),
-                                  params.cols * params.nkeys,
-                                  raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
+                                out.data(),
+                                params.cols * params.nkeys,
+                                raft::CompareApprox<double>(params.tolerance)));
 }
 INSTANTIATE_TEST_CASE_P(ReduceRowTests, ReduceRowTestD, ::testing::ValuesIn(inputsd2));
 
 // ReduceRowTestSmallnKey
 // 128 Obs, 32 cols, 3 clusters
 const std::vector<ReduceRowsInputs<float>> inputsf_small_nkey = {
-        {0.000001f, 128, 32, 3, 1234ULL, false},
-        {0.000001f, 128, 32, 3, 1234ULL, true, 5.0},
-        {0.000001f, 128, 32, 3, 1234ULL, true, 8.0}};
+  {0.000001f, 128, 32, 3, 1234ULL, false},
+  {0.000001f, 128, 32, 3, 1234ULL, true, 5.0},
+  {0.000001f, 128, 32, 3, 1234ULL, true, 8.0}};
 typedef ReduceRowTest<float> ReduceRowTestSmallnKey;
 TEST_P(ReduceRowTestSmallnKey, Result)
 {
-ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
-                              out.data(),
-                              params.cols * params.nkeys,
-                              raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
+                                out.data(),
+                                params.cols * params.nkeys,
+                                raft::CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_CASE_P(ReduceRowTests,
-        ReduceRowTestSmallnKey,
-        ::testing::ValuesIn(inputsf_small_nkey));
+                        ReduceRowTestSmallnKey,
+                        ::testing::ValuesIn(inputsf_small_nkey));
 
 // ReduceRowTestBigSpace
 // 512 Obs, 1024 cols, 32 clusters, double precision
 const std::vector<ReduceRowsInputs<double>> inputsd_big_space = {
-        {0.00000001, 512, 1024, 40, 1234ULL, false},
-        {0.00000001, 512, 1024, 40, 1234ULL, true, 4.0},
-        {0.00000001, 512, 1024, 40, 1234ULL, true, 16.0}};
+  {0.00000001, 512, 1024, 40, 1234ULL, false},
+  {0.00000001, 512, 1024, 40, 1234ULL, true, 4.0},
+  {0.00000001, 512, 1024, 40, 1234ULL, true, 16.0}};
 typedef ReduceRowTest<double> ReduceRowTestBigSpace;
 TEST_P(ReduceRowTestBigSpace, Result)
 {
-ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
-                              out.data(),
-                              params.cols * params.nkeys,
-                              raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
+                                out.data(),
+                                params.cols * params.nkeys,
+                                raft::CompareApprox<double>(params.tolerance)));
 }
 INSTANTIATE_TEST_CASE_P(ReduceRowTests,
-        ReduceRowTestBigSpace,
-        ::testing::ValuesIn(inputsd_big_space));
+                        ReduceRowTestBigSpace,
+                        ::testing::ValuesIn(inputsd_big_space));
 
 // ReduceRowTestManyObs
 // 100000 Obs, 37 cols, 32 clusters
 const std::vector<ReduceRowsInputs<float>> inputsf_many_obs = {
-        {0.00001f, 100000, 37, 32, 1234ULL, false},
-        {0.00001f, 100000, 37, 32, 1234ULL, true, 4.0},
-        {0.00001f, 100000, 37, 32, 1234ULL, true, 16.0}};
+  {0.00001f, 100000, 37, 32, 1234ULL, false},
+  {0.00001f, 100000, 37, 32, 1234ULL, true, 4.0},
+  {0.00001f, 100000, 37, 32, 1234ULL, true, 16.0}};
 typedef ReduceRowTest<float> ReduceRowTestManyObs;
 TEST_P(ReduceRowTestManyObs, Result)
 {
-ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
-                              out.data(),
-                              params.cols * params.nkeys,
-                              raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
+                                out.data(),
+                                params.cols * params.nkeys,
+                                raft::CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_CASE_P(ReduceRowTests,
-        ReduceRowTestManyObs,
-        ::testing::ValuesIn(inputsf_many_obs));
+                        ReduceRowTestManyObs,
+                        ::testing::ValuesIn(inputsf_many_obs));
 
 // ReduceRowTestManyClusters
 // 100000 Obs, 37 cols, 2048 clusters
 const std::vector<ReduceRowsInputs<float>> inputsf_many_cluster = {
-        {0.00001f, 100000, 37, 2048, 1234ULL, false},
-        {0.00001f, 100000, 37, 2048, 1234ULL, true, 32.0},
-        {0.00001f, 100000, 37, 2048, 1234ULL, true, 16.0}};
+  {0.00001f, 100000, 37, 2048, 1234ULL, false},
+  {0.00001f, 100000, 37, 2048, 1234ULL, true, 32.0},
+  {0.00001f, 100000, 37, 2048, 1234ULL, true, 16.0}};
 typedef ReduceRowTest<float> ReduceRowTestManyClusters;
 TEST_P(ReduceRowTestManyClusters, Result)
 {
-ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
-                              out.data(),
-                              params.cols * params.nkeys,
-                              raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(out_ref.data(),
+                                out.data(),
+                                params.cols * params.nkeys,
+                                raft::CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_CASE_P(ReduceRowTests,
-        ReduceRowTestManyClusters,
-        ::testing::ValuesIn(inputsf_many_cluster));
+                        ReduceRowTestManyClusters,
+                        ::testing::ValuesIn(inputsf_many_cluster));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/rsvd.cu b/cpp/test/linalg/rsvd.cu
index 260ea07268..7a315ddde6 100644
--- a/cpp/test/linalg/rsvd.cu
+++ b/cpp/test/linalg/rsvd.cu
@@ -16,283 +16,283 @@
 
 #include "test_utils.h"
 #include <gtest/gtest.h>
-#include <raft/linalg/rsvd.cuh>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
+#include <raft/linalg/rsvd.cuh>
 #include <raft/random/rng.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace raft {
-    namespace linalg {
-
-        template <typename T>
-        struct RsvdInputs {
-            T tolerance;
-            int n_row;
-            int n_col;
-            T PC_perc;
-            T UpS_perc;
-            int k;
-            int p;
-            bool use_bbt;
-            unsigned long long int seed;
-        };
-
-        template <typename T>
-        ::std::ostream& operator<<(::std::ostream& os, const RsvdInputs<T>& dims)
-        {
-            return os;
-        }
-
-        template <typename T>
-        class RsvdTest : public ::testing::TestWithParam<RsvdInputs<T>> {
-        protected:
-            RsvdTest()
-                    : A(0, stream),
-                      U(0, stream),
-                      S(0, stream),
-                      V(0, stream),
-                      left_eig_vectors_ref(0, stream),
-                      right_eig_vectors_ref(0, stream),
-                      sing_vals_ref(0, stream)
-            {
-            }
-
-            void SetUp() override
-            {
-                raft::handle_t handle;
-                stream = handle.get_stream();
-
-                params = ::testing::TestWithParam<RsvdInputs<T>>::GetParam();
-                // rSVD seems to be very sensitive to the random number sequence as well!
-                raft::random::Rng r(params.seed, raft::random::GenTaps);
-                int m = params.n_row, n = params.n_col;
-                T eig_svd_tol  = 1.e-7;
-                int max_sweeps = 100;
-
-                T mu = 0.0, sigma = 1.0;
-                A.resize(m * n, stream);
-                if (params.tolerance > 1) {  // Sanity check
-                    ASSERT(m == 3, "This test only supports mxn=3x2!");
-                    ASSERT(m * n == 6, "This test only supports mxn=3x2!");
-                    T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0};
-                    raft::update_device(A.data(), data_h, m * n, stream);
-
-                    T left_eig_vectors_ref_h[]  = {-0.308219, -0.906133, -0.289695};
-                    T right_eig_vectors_ref_h[] = {-0.638636, -0.769509};
-                    T sing_vals_ref_h[]         = {7.065283};
-
-                    left_eig_vectors_ref.resize(m, stream);
-                    right_eig_vectors_ref.resize(n, stream);
-                    sing_vals_ref.resize(1, stream);
-
-                    raft::update_device(left_eig_vectors_ref.data(), left_eig_vectors_ref_h, m * 1, stream);
-                    raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, n * 1, stream);
-                    raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, 1, stream);
-
-                } else {  // Other normal tests
-                    r.normal(A.data(), m * n, mu, sigma, stream);
-                }
-                std::vector<T> A_backup_cpu(m *
-                                            n);  // Backup A matrix as svdJacobi will destroy the content of A
-                raft::update_host(A_backup_cpu.data(), A.data(), m * n, stream);
-
-                if (params.k == 0) {
-                    params.k = max((int)(min(m, n) * params.PC_perc), 1);
-                    params.p = max((int)(min(m, n) * params.UpS_perc), 1);
-                }
-
-                U.resize(m * params.k, stream);
-                S.resize(params.k, stream);
-                V.resize(n * params.k, stream);
-                RAFT_CUDA_TRY(cudaMemsetAsync(U.data(), 0, U.size() * sizeof(T), stream));
-                RAFT_CUDA_TRY(cudaMemsetAsync(S.data(), 0, S.size() * sizeof(T), stream));
-                RAFT_CUDA_TRY(cudaMemsetAsync(V.data(), 0, V.size() * sizeof(T), stream));
-
-                // RSVD tests
-                if (params.k == 0) {  // Test with PC and upsampling ratio
-                    rsvdPerc(handle,
-                             A.data(),
-                             m,
-                             n,
-                             S.data(),
-                             U.data(),
-                             V.data(),
-                             params.PC_perc,
-                             params.UpS_perc,
-                             params.use_bbt,
-                             true,
-                             true,
-                             false,
-                             eig_svd_tol,
-                             max_sweeps,
-                             stream);
-                } else {  // Test with directly given fixed rank
-                    rsvdFixedRank(handle,
-                                  A.data(),
-                                  m,
-                                  n,
-                                  S.data(),
-                                  U.data(),
-                                  V.data(),
-                                  params.k,
-                                  params.p,
-                                  params.use_bbt,
-                                  true,
-                                  true,
-                                  true,
-                                  eig_svd_tol,
-                                  max_sweeps,
-                                  stream);
-                }
-                raft::update_device(A.data(), A_backup_cpu.data(), m * n, stream);
-            }
-
-        protected:
-            cudaStream_t stream = 0;
-            RsvdInputs<T> params;
-            rmm::device_uvector<T> A, U, S, V, left_eig_vectors_ref, right_eig_vectors_ref, sing_vals_ref;
-        };
-
-        const std::vector<RsvdInputs<float>> inputs_fx = {
-                // Test with ratios
-                {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL},     // Square + BBT
-                {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL},    // Tall + BBT
-                {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL},    // Square + non-BBT
-                {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL},   // Tall + non-BBT
-                {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL},   // Square + BBT
-                {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL},  // Tall + BBT
-                {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL},  // Square + non-BBT
-                {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL}  // Tall + non-BBT
-
-                ,                                                         // Test with fixed ranks
-                {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL},     // Square + BBT
-                {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL},    // Tall + BBT
-                {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL},    // Square + non-BBT
-                {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL},   // Tall + non-BBT
-                {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL},   // Square + BBT
-                {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL},  // Tall + BBT
-                {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL},  // Square + non-BBT
-                {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL}  // Tall + non-BBT
-        };
-
-        const std::vector<RsvdInputs<double>> inputs_dx = {
-                // Test with ratios
-                {0.20, 256, 256, 0.2, 0.05, 0, 0, true, 4321ULL},     // Square + BBT
-                {0.20, 2048, 256, 0.2, 0.05, 0, 0, true, 4321ULL},    // Tall + BBT
-                {0.20, 256, 256, 0.2, 0.05, 0, 0, false, 4321ULL},    // Square + non-BBT
-                {0.20, 2048, 256, 0.2, 0.05, 0, 0, false, 4321ULL},   // Tall + non-BBT
-                {0.20, 2048, 2048, 0.2, 0.05, 0, 0, true, 4321ULL},   // Square + BBT
-                {0.60, 16384, 2048, 0.2, 0.05, 0, 0, true, 4321ULL},  // Tall + BBT
-                {0.20, 2048, 2048, 0.2, 0.05, 0, 0, false, 4321ULL},  // Square + non-BBT
-                {0.60, 16384, 2048, 0.2, 0.05, 0, 0, false, 4321ULL}  // Tall + non-BBT
-
-                ,                                                      // Test with fixed ranks
-                {0.10, 256, 256, 0.0, 0.0, 100, 5, true, 4321ULL},     // Square + BBT
-                {0.12, 2048, 256, 0.0, 0.0, 100, 5, true, 4321ULL},    // Tall + BBT
-                {0.10, 256, 256, 0.0, 0.0, 100, 5, false, 4321ULL},    // Square + non-BBT
-                {0.12, 2048, 256, 0.0, 0.0, 100, 5, false, 4321ULL},   // Tall + non-BBT
-                {0.60, 2048, 2048, 0.0, 0.0, 100, 5, true, 4321ULL},   // Square + BBT
-                {1.00, 16384, 2048, 0.0, 0.0, 100, 5, true, 4321ULL},  // Tall + BBT
-                {0.60, 2048, 2048, 0.0, 0.0, 100, 5, false, 4321ULL},  // Square + non-BBT
-                {1.00, 16384, 2048, 0.0, 0.0, 100, 5, false, 4321ULL}  // Tall + non-BBT
-        };
-
-        const std::vector<RsvdInputs<float>> sanity_inputs_fx = {
-                {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, true, 4321ULL},
-                {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, true, 4321ULL},
-                {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, false, 4321ULL},
-                {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, false, 4321ULL}};
-
-        const std::vector<RsvdInputs<double>> sanity_inputs_dx = {
-                {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, true, 4321ULL},
-                {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, true, 4321ULL},
-                {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, false, 4321ULL},
-                {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, false, 4321ULL}};
-
-        typedef RsvdTest<float> RsvdSanityCheckValF;
-        TEST_P(RsvdSanityCheckValF, Result)
-    {
-        ASSERT_TRUE(devArrMatch(
-                sing_vals_ref.data(), S.data(), params.k, raft::CompareApproxAbs<float>(params.tolerance)));
+namespace linalg {
+
+template <typename T>
+struct RsvdInputs {
+  T tolerance;  // comparison tolerance; > 1 selects the fixed 3x2 sanity input in RsvdTest
+  int n_row;    // number of rows (m) of the input matrix A
+  int n_col;    // number of columns (n) of the input matrix A
+  T PC_perc;    // fraction of min(m, n) used to derive k when k == 0
+  T UpS_perc;   // fraction of min(m, n) used to derive p when k == 0
+  int k;        // rank passed to rsvdFixedRank; 0 means "derive k and p from the two ratios"
+  int p;        // passed to rsvdFixedRank next to k (presumably the oversampling size - confirm)
+  bool use_bbt;                 // forwarded unchanged to the rsvd call
+  unsigned long long int seed;  // seed for raft::random::Rng
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const RsvdInputs<T>& dims)
+{
+  return os;  // writes nothing: `dims` is ignored and the stream is returned as-is
+}
+
+// Fixture: fills an m x n matrix A, runs randomized SVD on it in SetUp() and leaves A (restored
+// from a host backup), U, S and V on the device for the TEST_P bodies to compare.
+template <typename T>
+class RsvdTest : public ::testing::TestWithParam<RsvdInputs<T>> {
+ protected:
+  RsvdTest()
+    : A(0, stream),
+      U(0, stream),
+      S(0, stream),
+      V(0, stream),
+      left_eig_vectors_ref(0, stream),
+      right_eig_vectors_ref(0, stream),
+      sing_vals_ref(0, stream)
+  {
+  }
+
+  void SetUp() override
+  {
+    raft::handle_t handle;
+    stream = handle.get_stream();
+    params = ::testing::TestWithParam<RsvdInputs<T>>::GetParam();
+    // rSVD seems to be very sensitive to the random number sequence as well!
+    raft::random::Rng r(params.seed, raft::random::GenTaps);
+    int m = params.n_row, n = params.n_col;
+    T eig_svd_tol  = 1.e-7;
+    int max_sweeps = 100;
+
+    T mu = 0.0, sigma = 1.0;
+    A.resize(m * n, stream);
+    if (params.tolerance > 1) {  // Sanity check
+      ASSERT(m == 3, "This test only supports mxn=3x2!");
+      ASSERT(m * n == 6, "This test only supports mxn=3x2!");
+      T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0};
+      raft::update_device(A.data(), data_h, m * n, stream);
+
+      T left_eig_vectors_ref_h[]  = {-0.308219, -0.906133, -0.289695};
+      T right_eig_vectors_ref_h[] = {-0.638636, -0.769509};
+      T sing_vals_ref_h[]         = {7.065283};
+      left_eig_vectors_ref.resize(m, stream);
+      right_eig_vectors_ref.resize(n, stream);
+      sing_vals_ref.resize(1, stream);
+      raft::update_device(left_eig_vectors_ref.data(), left_eig_vectors_ref_h, m * 1, stream);
+      raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, n * 1, stream);
+      raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, 1, stream);
+    } else {  // Other normal tests
+      r.normal(A.data(), m * n, mu, sigma, stream);
     }
+    // Backup A matrix as svdJacobi will destroy the content of A
+    std::vector<T> A_backup_cpu(m * n);
+    raft::update_host(A_backup_cpu.data(), A.data(), m * n, stream);
+
+    // Remember the mode now; params.k is overwritten just below and is never 0 afterwards.
+    const bool use_ratios = (params.k == 0);
+    if (use_ratios) {
+      params.k = max((int)(min(m, n) * params.PC_perc), 1);
+      params.p = max((int)(min(m, n) * params.UpS_perc), 1);
+    }
+
+    U.resize(m * params.k, stream);
+    S.resize(params.k, stream);
+    V.resize(n * params.k, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(U.data(), 0, U.size() * sizeof(T), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(S.data(), 0, S.size() * sizeof(T), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(V.data(), 0, V.size() * sizeof(T), stream));
+
+    // RSVD tests
+    if (use_ratios) {  // Test with PC and upsampling ratio
+      rsvdPerc(handle,
+               A.data(),
+               m,
+               n,
+               S.data(),
+               U.data(),
+               V.data(),
+               params.PC_perc,
+               params.UpS_perc,
+               params.use_bbt,
+               true,
+               true,
+               false,
+               eig_svd_tol,
+               max_sweeps,
+               stream);
+    } else {  // Test with directly given fixed rank
+      rsvdFixedRank(handle,
+                    A.data(),
+                    m,
+                    n,
+                    S.data(),
+                    U.data(),
+                    V.data(),
+                    params.k,
+                    params.p,
+                    params.use_bbt,
+                    true,
+                    true,
+                    true,
+                    eig_svd_tol,
+                    max_sweeps,
+                    stream);
+    }
+    raft::update_device(A.data(), A_backup_cpu.data(), m * n, stream);
+  }
+
+ protected:
+  cudaStream_t stream = 0;
+  RsvdInputs<T> params;
+  rmm::device_uvector<T> A, U, S, V, left_eig_vectors_ref, right_eig_vectors_ref, sing_vals_ref;
+};
+
+const std::vector<RsvdInputs<float>> inputs_fx = {
+  // Test with ratios
+  {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL},     // Square + BBT
+  {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL},    // Tall + BBT
+  {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL},    // Square + non-BBT
+  {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL},   // Tall + non-BBT
+  {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL},   // Square + BBT
+  {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL},  // Tall + BBT
+  {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL},  // Square + non-BBT
+  {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL}  // Tall + non-BBT
+
+  ,                                                         // Test with fixed ranks
+  {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL},     // Square + BBT
+  {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL},    // Tall + BBT
+  {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL},    // Square + non-BBT
+  {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL},   // Tall + non-BBT
+  {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL},   // Square + BBT
+  {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL},  // Tall + BBT
+  {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL},  // Square + non-BBT
+  {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL}  // Tall + non-BBT
+};
+
+const std::vector<RsvdInputs<double>> inputs_dx = {
+  // Test with ratios
+  {0.20, 256, 256, 0.2, 0.05, 0, 0, true, 4321ULL},     // Square + BBT
+  {0.20, 2048, 256, 0.2, 0.05, 0, 0, true, 4321ULL},    // Tall + BBT
+  {0.20, 256, 256, 0.2, 0.05, 0, 0, false, 4321ULL},    // Square + non-BBT
+  {0.20, 2048, 256, 0.2, 0.05, 0, 0, false, 4321ULL},   // Tall + non-BBT
+  {0.20, 2048, 2048, 0.2, 0.05, 0, 0, true, 4321ULL},   // Square + BBT
+  {0.60, 16384, 2048, 0.2, 0.05, 0, 0, true, 4321ULL},  // Tall + BBT
+  {0.20, 2048, 2048, 0.2, 0.05, 0, 0, false, 4321ULL},  // Square + non-BBT
+  {0.60, 16384, 2048, 0.2, 0.05, 0, 0, false, 4321ULL}  // Tall + non-BBT
+
+  ,                                                      // Test with fixed ranks
+  {0.10, 256, 256, 0.0, 0.0, 100, 5, true, 4321ULL},     // Square + BBT
+  {0.12, 2048, 256, 0.0, 0.0, 100, 5, true, 4321ULL},    // Tall + BBT
+  {0.10, 256, 256, 0.0, 0.0, 100, 5, false, 4321ULL},    // Square + non-BBT
+  {0.12, 2048, 256, 0.0, 0.0, 100, 5, false, 4321ULL},   // Tall + non-BBT
+  {0.60, 2048, 2048, 0.0, 0.0, 100, 5, true, 4321ULL},   // Square + BBT
+  {1.00, 16384, 2048, 0.0, 0.0, 100, 5, true, 4321ULL},  // Tall + BBT
+  {0.60, 2048, 2048, 0.0, 0.0, 100, 5, false, 4321ULL},  // Square + non-BBT
+  {1.00, 16384, 2048, 0.0, 0.0, 100, 5, false, 4321ULL}  // Tall + non-BBT
+};
+
+const std::vector<RsvdInputs<float>> sanity_inputs_fx = {
+  {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, true, 4321ULL},
+  {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, true, 4321ULL},
+  {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, false, 4321ULL},
+  {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, false, 4321ULL}};
+
+const std::vector<RsvdInputs<double>> sanity_inputs_dx = {
+  {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, true, 4321ULL},
+  {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, true, 4321ULL},
+  {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, false, 4321ULL},
+  {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, false, 4321ULL}};
+
+typedef RsvdTest<float> RsvdSanityCheckValF;
+TEST_P(RsvdSanityCheckValF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    sing_vals_ref.data(), S.data(), params.k, raft::CompareApproxAbs<float>(params.tolerance)));
+}
 
-    typedef RsvdTest<double> RsvdSanityCheckValD;
-    TEST_P(RsvdSanityCheckValD, Result)
+typedef RsvdTest<double> RsvdSanityCheckValD;
+TEST_P(RsvdSanityCheckValD, Result)
 {
-    ASSERT_TRUE(devArrMatch(
-            sing_vals_ref.data(), S.data(), params.k, raft::CompareApproxAbs<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(
+    sing_vals_ref.data(), S.data(), params.k, raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef RsvdTest<float> RsvdSanityCheckLeftVecF;
 TEST_P(RsvdSanityCheckLeftVecF, Result)
 {
-ASSERT_TRUE(devArrMatch(left_eig_vectors_ref.data(),
-                        U.data(),
-                        params.n_row * params.k,
-                        raft::CompareApproxAbs<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(left_eig_vectors_ref.data(),
+                          U.data(),
+                          params.n_row * params.k,
+                          raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef RsvdTest<double> RsvdSanityCheckLeftVecD;
 TEST_P(RsvdSanityCheckLeftVecD, Result)
 {
-ASSERT_TRUE(devArrMatch(left_eig_vectors_ref.data(),
-                        U.data(),
-                        params.n_row * params.k,
-                        raft::CompareApproxAbs<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(left_eig_vectors_ref.data(),
+                          U.data(),
+                          params.n_row * params.k,
+                          raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef RsvdTest<float> RsvdSanityCheckRightVecF;
 TEST_P(RsvdSanityCheckRightVecF, Result)
 {
-ASSERT_TRUE(devArrMatch(right_eig_vectors_ref.data(),
-                        V.data(),
-                        params.n_col * params.k,
-                        raft::CompareApproxAbs<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(right_eig_vectors_ref.data(),
+                          V.data(),
+                          params.n_col * params.k,
+                          raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef RsvdTest<double> RsvdSanityCheckRightVecD;
 TEST_P(RsvdSanityCheckRightVecD, Result)
 {
-ASSERT_TRUE(devArrMatch(right_eig_vectors_ref.data(),
-                        V.data(),
-                        params.n_col * params.k,
-                        raft::CompareApproxAbs<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(right_eig_vectors_ref.data(),
+                          V.data(),
+                          params.n_col * params.k,
+                          raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef RsvdTest<float> RsvdTestSquareMatrixNormF;
 TEST_P(RsvdTestSquareMatrixNormF, Result)
 {
-raft::handle_t handle;
-
-ASSERT_TRUE(raft::linalg::evaluateSVDByL2Norm(handle,
-                                              A.data(),
-                                              U.data(),
-                                              S.data(),
-                                              V.data(),
-                                              params.n_row,
-                                              params.n_col,
-                                              params.k,
-                                              4 * params.tolerance,
-                                              handle.get_stream()));
+  raft::handle_t handle;
+
+  ASSERT_TRUE(raft::linalg::evaluateSVDByL2Norm(handle,
+                                                A.data(),
+                                                U.data(),
+                                                S.data(),
+                                                V.data(),
+                                                params.n_row,
+                                                params.n_col,
+                                                params.k,
+                                                4 * params.tolerance,
+                                                handle.get_stream()));
 }
 
 typedef RsvdTest<double> RsvdTestSquareMatrixNormD;
 TEST_P(RsvdTestSquareMatrixNormD, Result)
 {
-raft::handle_t handle;
-
-ASSERT_TRUE(raft::linalg::evaluateSVDByL2Norm(handle,
-                                              A.data(),
-                                              U.data(),
-                                              S.data(),
-                                              V.data(),
-                                              params.n_row,
-                                              params.n_col,
-                                              params.k,
-                                              4 * params.tolerance,
-                                              handle.get_stream()));
+  raft::handle_t handle;
+
+  ASSERT_TRUE(raft::linalg::evaluateSVDByL2Norm(handle,
+                                                A.data(),
+                                                U.data(),
+                                                S.data(),
+                                                V.data(),
+                                                params.n_row,
+                                                params.n_col,
+                                                params.k,
+                                                4 * params.tolerance,
+                                                handle.get_stream()));
 }
 
 INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdSanityCheckValF, ::testing::ValuesIn(sanity_inputs_fx));
diff --git a/cpp/test/linalg/sqrt.cu b/cpp/test/linalg/sqrt.cu
index bf64d264ad..f604a8a1ef 100644
--- a/cpp/test/linalg/sqrt.cu
+++ b/cpp/test/linalg/sqrt.cu
@@ -21,94 +21,94 @@
 #include <raft/random/rng.hpp>
 
 namespace raft {
-    namespace linalg {
-
-        template <typename Type>
-        __global__ void naiveSqrtElemKernel(Type* out, const Type* in1, int len)
-        {
-            int idx = threadIdx.x + blockIdx.x * blockDim.x;
-            if (idx < len) { out[idx] = raft::mySqrt(in1[idx]); }
-        }
-
-        template <typename Type>
-        void naiveSqrtElem(Type* out, const Type* in1, int len)
-        {
-            static const int TPB = 64;
-            int nblks            = raft::ceildiv(len, TPB);
-            naiveSqrtElemKernel<Type><<<nblks, TPB>>>(out, in1, len);
-            RAFT_CUDA_TRY(cudaPeekAtLastError());
-        }
-
-        template <typename T>
-        struct SqrtInputs {
-            T tolerance;
-            int len;
-            unsigned long long int seed;
-        };
-
-        template <typename T>
-        ::std::ostream& operator<<(::std::ostream& os, const SqrtInputs<T>& dims)
-        {
-            return os;
-        }
-
-        template <typename T>
-        class SqrtTest : public ::testing::TestWithParam<SqrtInputs<T>> {
-        protected:
-            SqrtTest() : in1(0, stream), out_ref(0, stream), out(0, stream) {}
-
-            void SetUp() override
-            {
-                params = ::testing::TestWithParam<SqrtInputs<T>>::GetParam();
-                raft::random::Rng r(params.seed);
-                RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-                int len = params.len;
-                in1.resize(len, stream);
-                out_ref.resize(len, stream);
-                out.resize(len, stream);
-                r.uniform(in1.data(), len, T(1.0), T(2.0), stream);
-
-                naiveSqrtElem(out_ref.data(), in1.data(), len);
-
-                sqrt(out.data(), in1.data(), len, stream);
-                sqrt(in1.data(), in1.data(), len, stream);
-                RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-            }
-
-        protected:
-            cudaStream_t stream = 0;
-            SqrtInputs<T> params;
-            rmm::device_uvector<T> in1, out_ref, out;
-            int device_count = 0;
-        };
-
-        const std::vector<SqrtInputs<float>> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}};
-
-        const std::vector<SqrtInputs<double>> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}};
-
-        typedef SqrtTest<float> SqrtTestF;
-        TEST_P(SqrtTestF, Result)
-    {
-        ASSERT_TRUE(raft::devArrMatch(
-                out_ref.data(), out.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
-
-        ASSERT_TRUE(raft::devArrMatch(
-                out_ref.data(), in1.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
-    }
-
-    typedef SqrtTest<double> SqrtTestD;
-    TEST_P(SqrtTestD, Result)
+namespace linalg {
+
+template <typename Type>
+__global__ void naiveSqrtElemKernel(Type* out, const Type* in1, int len)
+{
+  const int gid = blockIdx.x * blockDim.x + threadIdx.x;  // this thread's element index
+  if (gid < len) { out[gid] = raft::mySqrt(in1[gid]); }   // threads past the end do nothing
+}
+
+template <typename Type>
+void naiveSqrtElem(Type* out, const Type* in1, int len)
+{
+  constexpr int threads_per_block = 64;
+  const int num_blocks            = raft::ceildiv(len, threads_per_block);
+  naiveSqrtElemKernel<Type><<<num_blocks, threads_per_block>>>(out, in1, len);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());  // report an error left by the launch above
+}
+
+template <typename T>
+struct SqrtInputs {
+  T tolerance;                  // tolerance handed to raft::CompareApprox in the TEST_P bodies
+  int len;                      // number of elements in each device buffer
+  unsigned long long int seed;  // seed for raft::random::Rng
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const SqrtInputs<T>& dims)
+{
+  return os;  // writes nothing: `dims` is ignored and the stream is returned as-is
+}
+
+template <typename T>
+class SqrtTest : public ::testing::TestWithParam<SqrtInputs<T>> {
+ protected:
+  SqrtTest() : in1(0, stream), out_ref(0, stream), out(0, stream) {}
+
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<SqrtInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+    int len = params.len;
+    in1.resize(len, stream);
+    out_ref.resize(len, stream);
+    out.resize(len, stream);
+    r.uniform(in1.data(), len, T(1.0), T(2.0), stream);
+    naiveSqrtElem(out_ref.data(), in1.data(), len);
+    sqrt(out.data(), in1.data(), len, stream);
+    sqrt(in1.data(), in1.data(), len, stream);  // in-place variant, also checked against out_ref
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));  // results must be complete before TEST_P runs
+  }
+
+  // Destroy the stream only after the test body has read the buffers that were resized on it.
+  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
+
+  cudaStream_t stream = 0;
+  SqrtInputs<T> params;
+  rmm::device_uvector<T> in1, out_ref, out;
+  int device_count = 0;
+};
+
+const std::vector<SqrtInputs<float>> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}};
+
+const std::vector<SqrtInputs<double>> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}};
+
+typedef SqrtTest<float> SqrtTestF;
+TEST_P(SqrtTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    out_ref.data(), out.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
+
+  ASSERT_TRUE(raft::devArrMatch(
+    out_ref.data(), in1.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
+}
+
+typedef SqrtTest<double> SqrtTestD;
+TEST_P(SqrtTestD, Result)
 {
-    ASSERT_TRUE(raft::devArrMatch(
-            out_ref.data(), out.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    out_ref.data(), out.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
 
-    ASSERT_TRUE(raft::devArrMatch(
-            out_ref.data(), in1.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    out_ref.data(), in1.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
 }
 
 INSTANTIATE_TEST_CASE_P(SqrtTests, SqrtTestF, ::testing::ValuesIn(inputsf2));
 
 INSTANTIATE_TEST_CASE_P(SqrtTests, SqrtTestD, ::testing::ValuesIn(inputsd2));
 
-}  // end namespace LinAlg
-}  // end namespace MLCommon
+}  // namespace linalg
+}  // namespace raft
diff --git a/cpp/test/linalg/ternary_op.cu b/cpp/test/linalg/ternary_op.cu
index 83ec3e6029..168b0cd31d 100644
--- a/cpp/test/linalg/ternary_op.cu
+++ b/cpp/test/linalg/ternary_op.cu
@@ -16,90 +16,90 @@
 
 #include "test_utils.h"
 #include <gtest/gtest.h>
-#include <raft/linalg/ternary_op.cuh>
 #include <raft/cudart_utils.h>
+#include <raft/linalg/ternary_op.cuh>
 #include <raft/random/rng.hpp>
 
 namespace raft {
-    namespace linalg {
+namespace linalg {
 
-        template <typename InType, typename IdxType = int, typename OutType = InType>
-        struct BinaryOpInputs {
-            InType tolerance;
-            IdxType len;
-            unsigned long long int seed;
-        };
+template <typename InType, typename IdxType = int, typename OutType = InType>
+struct BinaryOpInputs {
+  InType tolerance;
+  IdxType len;
+  unsigned long long int seed;
+};
 
-        template <typename InType, typename IdxType = int, typename OutType = InType>
-        ::std::ostream& operator<<(::std::ostream& os, const BinaryOpInputs<InType, IdxType, OutType>& d)
-        {
-            return os;
-        }
+template <typename InType, typename IdxType = int, typename OutType = InType>
+::std::ostream& operator<<(::std::ostream& os, const BinaryOpInputs<InType, IdxType, OutType>& d)
+{
+  return os;
+}
 
-        template <typename T>
-        class ternaryOpTest : public ::testing::TestWithParam<BinaryOpInputs<T>> {
-        public:
-            ternaryOpTest()
-                    : params(::testing::TestWithParam<BinaryOpInputs<T>>::GetParam()),
-                      stream(handle.get_stream()),
-                      out_add_ref(params.len, stream),
-                      out_add(params.len, stream),
-                      out_mul_ref(params.len, stream),
-                      out_mul(params.len, stream)
-            {
-            }
+template <typename T>
+class ternaryOpTest : public ::testing::TestWithParam<BinaryOpInputs<T>> {
+ public:
+  ternaryOpTest()
+    : params(::testing::TestWithParam<BinaryOpInputs<T>>::GetParam()),
+      stream(handle.get_stream()),
+      out_add_ref(params.len, stream),
+      out_add(params.len, stream),
+      out_mul_ref(params.len, stream),
+      out_mul(params.len, stream)
+  {
+  }
 
-            void SetUp() override
-            {
-                raft::random::Rng rng(params.seed);
-                int len = params.len;
-                rmm::device_uvector<T> in1(len, stream);
-                rmm::device_uvector<T> in2(len, stream);
-                rmm::device_uvector<T> in3(len, stream);
+  void SetUp() override
+  {
+    raft::random::Rng rng(params.seed);
+    int len = params.len;
+    rmm::device_uvector<T> in1(len, stream);
+    rmm::device_uvector<T> in2(len, stream);
+    rmm::device_uvector<T> in3(len, stream);
 
-                rng.fill(out_add_ref.data(), len, T(6.0), stream);
-                rng.fill(out_mul_ref.data(), len, T(6.0), stream);
-                rng.fill(in1.data(), len, T(1.0), stream);
-                rng.fill(in2.data(), len, T(2.0), stream);
-                rng.fill(in3.data(), len, T(3.0), stream);
+    rng.fill(out_add_ref.data(), len, T(6.0), stream);
+    rng.fill(out_mul_ref.data(), len, T(6.0), stream);
+    rng.fill(in1.data(), len, T(1.0), stream);
+    rng.fill(in2.data(), len, T(2.0), stream);
+    rng.fill(in3.data(), len, T(3.0), stream);
 
-                auto add = [] __device__(T a, T b, T c) { return a + b + c; };
-                auto mul = [] __device__(T a, T b, T c) { return a * b * c; };
-                ternaryOp(out_add.data(), in1.data(), in2.data(), in3.data(), len, add, stream);
-                ternaryOp(out_mul.data(), in1.data(), in2.data(), in3.data(), len, mul, stream);
-            }
+    auto add = [] __device__(T a, T b, T c) { return a + b + c; };
+    auto mul = [] __device__(T a, T b, T c) { return a * b * c; };
+    ternaryOp(out_add.data(), in1.data(), in2.data(), in3.data(), len, add, stream);
+    ternaryOp(out_mul.data(), in1.data(), in2.data(), in3.data(), len, mul, stream);
+  }
 
-        protected:
-            BinaryOpInputs<T> params;
-            raft::handle_t handle;
-            cudaStream_t stream = 0;
+ protected:
+  BinaryOpInputs<T> params;
+  raft::handle_t handle;
+  cudaStream_t stream = 0;
 
-            rmm::device_uvector<T> out_add_ref, out_add, out_mul_ref, out_mul;
-        };
+  rmm::device_uvector<T> out_add_ref, out_add, out_mul_ref, out_mul;
+};
 
-        const std::vector<BinaryOpInputs<float>> inputsf = {{0.000001f, 1024 * 1024, 1234ULL},
-                                                            {0.000001f, 1024 * 1024 + 2, 1234ULL},
-                                                            {0.000001f, 1024 * 1024 + 1, 1234ULL}};
-        typedef ternaryOpTest<float> ternaryOpTestF;
-        TEST_P(ternaryOpTestF, Result)
-    {
-        ASSERT_TRUE(devArrMatch(
-                out_add_ref.data(), out_add.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
-        ASSERT_TRUE(devArrMatch(
-                out_mul_ref.data(), out_mul.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
-    }
-    INSTANTIATE_TEST_CASE_P(ternaryOpTests, ternaryOpTestF, ::testing::ValuesIn(inputsf));
+const std::vector<BinaryOpInputs<float>> inputsf = {{0.000001f, 1024 * 1024, 1234ULL},
+                                                    {0.000001f, 1024 * 1024 + 2, 1234ULL},
+                                                    {0.000001f, 1024 * 1024 + 1, 1234ULL}};
+typedef ternaryOpTest<float> ternaryOpTestF;
+TEST_P(ternaryOpTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out_add_ref.data(), out_add.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(
+    out_mul_ref.data(), out_mul.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(ternaryOpTests, ternaryOpTestF, ::testing::ValuesIn(inputsf));
 
-    const std::vector<BinaryOpInputs<double>> inputsd = {{0.00000001, 1024 * 1024, 1234ULL},
-                                                         {0.00000001, 1024 * 1024 + 2, 1234ULL},
-                                                         {0.00000001, 1024 * 1024 + 1, 1234ULL}};
-    typedef ternaryOpTest<double> ternaryOpTestD;
-    TEST_P(ternaryOpTestD, Result)
+const std::vector<BinaryOpInputs<double>> inputsd = {{0.00000001, 1024 * 1024, 1234ULL},
+                                                     {0.00000001, 1024 * 1024 + 2, 1234ULL},
+                                                     {0.00000001, 1024 * 1024 + 1, 1234ULL}};
+typedef ternaryOpTest<double> ternaryOpTestD;
+TEST_P(ternaryOpTestD, Result)
 {
-    ASSERT_TRUE(devArrMatch(
-            out_add_ref.data(), out_add.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
-    ASSERT_TRUE(devArrMatch(
-            out_mul_ref.data(), out_mul.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(
+    out_add_ref.data(), out_add.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(
+    out_mul_ref.data(), out_mul.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
 }
 INSTANTIATE_TEST_CASE_P(ternaryOpTests, ternaryOpTestD, ::testing::ValuesIn(inputsd));
 

From 4909d2c1a005d169e55488a087c38245c67a23a5 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 9 Feb 2022 19:16:07 -0500
Subject: [PATCH 03/24] Updating style

---
 cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh | 5 +++--
 cpp/include/raft/linalg/power.cuh                     | 1 +
 cpp/test/linalg/power.cu                              | 1 +
 cpp/test/linalg/sqrt.cu                               | 2 +-
 4 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
index c88895807d..680c95f1f4 100644
--- a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
@@ -301,8 +301,9 @@ __global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT
     // same for the whole block
     sh_key_inx++;
 #else
-    if (d_keys[r] != global_key) continue;  // No divergence since global_key is the
-                                            // same for the whole block
+    if (d_keys[r] != global_key)
+      continue;  // No divergence since global_key is the
+                 // same for the whole block
 #endif
     // if ((end_row-start_row) / (r-start_row) != global_key) continue;
     DataType val = __ldcg(&d_A[r * lda + this_col]);
diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh
index 07760f0c5c..d17fa9a043 100644
--- a/cpp/include/raft/linalg/power.cuh
+++ b/cpp/include/raft/linalg/power.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/binary_op.hpp>
 #include <raft/linalg/unary_op.hpp>
 
 namespace raft {
diff --git a/cpp/test/linalg/power.cu b/cpp/test/linalg/power.cu
index 13d9791992..d3c76e1049 100644
--- a/cpp/test/linalg/power.cu
+++ b/cpp/test/linalg/power.cu
@@ -20,6 +20,7 @@
 #include <raft/linalg/power.cuh>
 #include <raft/random/rng.hpp>
 
+namespace raft {
 namespace linalg {
 
 template <typename Type>
diff --git a/cpp/test/linalg/sqrt.cu b/cpp/test/linalg/sqrt.cu
index f604a8a1ef..27fa0f8959 100644
--- a/cpp/test/linalg/sqrt.cu
+++ b/cpp/test/linalg/sqrt.cu
@@ -16,8 +16,8 @@
 
 #include "test_utils.h"
 #include <gtest/gtest.h>
-#include <linalg/sqrt.cuh>
 #include <raft/cudart_utils.h>
+#include <raft/linalg/sqrt.cuh>
 #include <raft/random/rng.hpp>
 
 namespace raft {

From df48d3405f86feea9ac405aeb6a2ededfb045b10 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 9 Feb 2022 19:29:51 -0500
Subject: [PATCH 04/24] Fixing include for test utils

---
 cpp/test/linalg/power.cu              | 2 +-
 cpp/test/linalg/reduce_cols_by_key.cu | 2 +-
 cpp/test/linalg/reduce_rows_by_key.cu | 2 +-
 cpp/test/linalg/rsvd.cu               | 2 +-
 cpp/test/linalg/sqrt.cu               | 2 +-
 cpp/test/linalg/ternary_op.cu         | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/test/linalg/power.cu b/cpp/test/linalg/power.cu
index d3c76e1049..8c862bbeab 100644
--- a/cpp/test/linalg/power.cu
+++ b/cpp/test/linalg/power.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "test_utils.h"
+#include "../test_utils.h"
 #include <gtest/gtest.h>
 #include <raft/cudart_utils.h>
 #include <raft/linalg/power.cuh>
diff --git a/cpp/test/linalg/reduce_cols_by_key.cu b/cpp/test/linalg/reduce_cols_by_key.cu
index 072f586bac..94459769f8 100644
--- a/cpp/test/linalg/reduce_cols_by_key.cu
+++ b/cpp/test/linalg/reduce_cols_by_key.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "test_utils.h"
+#include "../test_utils.h"
 #include <gtest/gtest.h>
 #include <raft/cudart_utils.h>
 #include <raft/interruptible.hpp>
diff --git a/cpp/test/linalg/reduce_rows_by_key.cu b/cpp/test/linalg/reduce_rows_by_key.cu
index 1bda427e6f..9219c4f561 100644
--- a/cpp/test/linalg/reduce_rows_by_key.cu
+++ b/cpp/test/linalg/reduce_rows_by_key.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "test_utils.h"
+#include "../test_utils.h"
 #include <gtest/gtest.h>
 #include <iostream>
 #include <raft/cudart_utils.h>
diff --git a/cpp/test/linalg/rsvd.cu b/cpp/test/linalg/rsvd.cu
index 7a315ddde6..b8e44580b5 100644
--- a/cpp/test/linalg/rsvd.cu
+++ b/cpp/test/linalg/rsvd.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "test_utils.h"
+#include "../test_utils.h"
 #include <gtest/gtest.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
diff --git a/cpp/test/linalg/sqrt.cu b/cpp/test/linalg/sqrt.cu
index 27fa0f8959..6aa6376c26 100644
--- a/cpp/test/linalg/sqrt.cu
+++ b/cpp/test/linalg/sqrt.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "test_utils.h"
+#include "../test_utils.h"
 #include <gtest/gtest.h>
 #include <raft/cudart_utils.h>
 #include <raft/linalg/sqrt.cuh>
diff --git a/cpp/test/linalg/ternary_op.cu b/cpp/test/linalg/ternary_op.cu
index 168b0cd31d..4140a9c4b3 100644
--- a/cpp/test/linalg/ternary_op.cu
+++ b/cpp/test/linalg/ternary_op.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "test_utils.h"
+#include "../test_utils.h"
 #include <gtest/gtest.h>
 #include <raft/cudart_utils.h>
 #include <raft/linalg/ternary_op.cuh>

From 3c2fc7e31c222978378b23a18fd8fe369822285d Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 9 Feb 2022 19:34:44 -0500
Subject: [PATCH 05/24] Updating lstsq

---
 cpp/include/raft/linalg/detail/lstsq.hpp | 61 +++++++++++++-----------
 1 file changed, 32 insertions(+), 29 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/lstsq.hpp b/cpp/include/raft/linalg/detail/lstsq.hpp
index c91d6e41c1..6553394cc4 100644
--- a/cpp/include/raft/linalg/detail/lstsq.hpp
+++ b/cpp/include/raft/linalg/detail/lstsq.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <common/nvtx.hpp>
 #include <raft/common/nvtx.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
@@ -50,7 +51,7 @@ struct DeviceEvent {
   DeviceEvent(bool concurrent)
   {
     if (concurrent)
-      RAFT_CUDA_TRY(cudaEventCreate(&e));
+      RAFT_CUDA_TRY(cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
     else
       e = nullptr;
   }
@@ -60,23 +61,16 @@ struct DeviceEvent {
     if (e != nullptr) RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(e));
   }
 
-  operator cudaEvent_t() const { return e; }
-
   void record(cudaStream_t stream)
   {
     if (e != nullptr) RAFT_CUDA_TRY(cudaEventRecord(e, stream));
   }
 
-  void wait(cudaStream_t stream)
+  void wait_by(cudaStream_t stream)
   {
     if (e != nullptr) RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, e, 0u));
   }
 
-  void wait()
-  {
-    if (e != nullptr) raft::interruptible::synchronize(e);
-  }
-
   DeviceEvent& operator=(const DeviceEvent& other) = delete;
 };
 
@@ -265,27 +259,26 @@ void lstsqEig(const raft::handle_t& handle,
               cudaStream_t stream)
 {
   rmm::cuda_stream_view mainStream   = rmm::cuda_stream_view(stream);
-  rmm::cuda_stream_view multAbStream = mainStream;
-  bool concurrent                    = false;
-  {
-    int sp_size = handle.get_stream_pool_size();
-    if (sp_size > 0) {
-      multAbStream = handle.get_stream_from_stream_pool(0);
-      // check if the two streams can run concurrently
-      if (!are_implicitly_synchronized(mainStream, multAbStream)) {
-        concurrent = true;
-      } else if (sp_size > 1) {
-        mainStream   = multAbStream;
-        multAbStream = handle.get_stream_from_stream_pool(1);
-        concurrent   = true;
-      }
-    }
+  rmm::cuda_stream_view multAbStream = handle.get_next_usable_stream();
+  bool concurrent;
+  // Check if the two streams can run concurrently. This is needed because a legacy default stream
+  // would synchronize with other blocking streams. To avoid synchronization in such case, we try to
+  // use an additional stream from the pool.
+  if (!are_implicitly_synchronized(mainStream, multAbStream)) {
+    concurrent = true;
+  } else if (handle.get_stream_pool_size() > 1) {
+    mainStream = handle.get_next_usable_stream();
+    concurrent = true;
+  } else {
+    multAbStream = mainStream;
+    concurrent   = false;
   }
-  // the event is created only if the given raft handle is capable of running
-  // at least two CUDA streams without implicit synchronization.
-  DeviceEvent multAbDone(concurrent);
 
   rmm::device_uvector<math_t> workset(n_cols * n_cols * 3 + n_cols * 2, mainStream);
+  // the event is created only if the given raft handle is capable of running
+  // at least two CUDA streams without implicit synchronization.
+  DeviceEvent worksetDone(concurrent);
+  worksetDone.record(mainStream);
   math_t* Q    = workset.data();
   math_t* QS   = Q + n_cols * n_cols;
   math_t* covA = QS + n_cols * n_cols;
@@ -310,7 +303,9 @@ void lstsqEig(const raft::handle_t& handle,
                      mainStream);
 
   // Ab <- A* b
+  worksetDone.wait_by(multAbStream);
   raft::linalg::gemv(handle, A, n_rows, n_cols, b, Ab, true, multAbStream);
+  DeviceEvent multAbDone(concurrent);
   multAbDone.record(multAbStream);
 
   // Q S Q* <- covA
@@ -335,9 +330,18 @@ void lstsqEig(const raft::handle_t& handle,
                      alpha,
                      beta,
                      mainStream);
-  multAbDone.wait(mainStream);
+
+  multAbDone.wait_by(mainStream);
   // w <- covA Ab == Q invS Q* A b == inv(A* A) A b
   raft::linalg::gemv(handle, covA, n_cols, n_cols, Ab, w, false, mainStream);
+
+  // This event is created only if we use two worker streams, and `stream` is not the legacy stream,
+  // and `mainStream` is not a non-blocking stream. In fact, with the current logic these conditions
+  // are impossible together, but it still makes sense to put this construct here to emphasize that
+  // `stream` must wait till the work here is done (for future refactorings).
+  DeviceEvent mainDone(!are_implicitly_synchronized(mainStream, stream));
+  mainDone.record(mainStream);
+  mainDone.wait_by(stream);
 }
 
 /** Solves the linear ordinary least squares problem `Aw = b`
@@ -448,7 +452,6 @@ void lstsqQR(const raft::handle_t& handle,
 
   RAFT_CUDA_TRY(cudaMemcpyAsync(w, b, sizeof(math_t) * n, cudaMemcpyDeviceToDevice, stream));
 }
-
 };  // namespace detail
 };  // namespace linalg
 };  // namespace raft

From 0b5ba541ed0f5ee202d6faa237839f73d9e8bdfa Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 9 Feb 2022 20:22:55 -0500
Subject: [PATCH 06/24] Adding missing reduction test

---
 cpp/test/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index c0db20f650..2ace88b498 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -59,6 +59,7 @@ add_executable(test_raft
     test/linalg/power.cu
     test/linalg/reduce.cu
     test/linalg/reduce_cols_by_key.cu
+    test/linalg/reduce_rows_by_key.cu
     test/linalg/rsvd.cu
     test/linalg/sqrt.cu
     test/linalg/strided_reduction.cu

From 1b613b2c974845e03fa3b2f26f4389da898f6d1a Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 9 Feb 2022 20:41:51 -0500
Subject: [PATCH 07/24] Fixing lstsq

---
 cpp/include/raft/linalg/lstsq.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/linalg/lstsq.hpp b/cpp/include/raft/linalg/lstsq.hpp
index 5540cca3a5..cdf67e422d 100644
--- a/cpp/include/raft/linalg/lstsq.hpp
+++ b/cpp/include/raft/linalg/lstsq.hpp
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <raft/handle.hpp>
-#include <raft/linalg/detail/lstsq.cuh>
+#include <raft/linalg/detail/lstsq.hpp>
 namespace raft {
 namespace linalg {
 

From c7f059fb6f0646edd7dce5768c391412fb6e4db7 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 9 Feb 2022 21:42:10 -0500
Subject: [PATCH 08/24] Typo

---
 cpp/include/raft/linalg/reduce_rows_by_key.cuh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
index 986f5e8a7f..b97a25dbac 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -95,7 +95,5 @@ void reduce_rows_by_key(const DataIteratorT d_A,
                      stream);
 }
 
-};  // end namespace detail
 };  // end namespace linalg
-}
-;  // end namespace raft
+};  // end namespace raft

From 3f207c59975f377a4e92bd93f59b4ec75e9a9926 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 9 Feb 2022 22:24:17 -0500
Subject: [PATCH 09/24] Exposing convert_array

---
 cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh |  5 +++++
 cpp/include/raft/linalg/reduce_rows_by_key.cuh        | 11 +++++++++++
 2 files changed, 16 insertions(+)

diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
index 680c95f1f4..aa0b1545d3 100644
--- a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
@@ -40,6 +40,11 @@ void __global__ convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n)
   }
 }
 
+//
+// Small helper function to convert from int->char and char->int
+// Transform ncols*nrows read of int in 2*nrows reads of int + ncols*rows reads of chars
+//
+
 template <typename IteratorT1, typename IteratorT2>
 void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st)
 {
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
index b97a25dbac..5b34f5a6ec 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -20,6 +20,17 @@
 
 namespace raft {
 namespace linalg {
+
+/**
+ Small helper function to convert from int->char and char->int
+ Transform ncols*nrows read of int in 2*nrows reads of int + ncols*rows reads of chars
+**/
+template <typename IteratorT1, typename IteratorT2>
+void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st)
+{
+  detail::convert_array(dst, src, st);
+}
+
 /**
  * @brief Computes the weighted reduction of matrix rows for each given key
  *

From 34f7bf3a33b5e8d9764304c6e170a9416024cae6 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 9 Feb 2022 23:09:36 -0500
Subject: [PATCH 10/24] Oops

---
 cpp/include/raft/linalg/reduce_rows_by_key.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
index 5b34f5a6ec..76d4ed4971 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -28,7 +28,7 @@ namespace linalg {
 template <typename IteratorT1, typename IteratorT2>
 void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st)
 {
-  detail::convert_array(dst, src, st);
+  detail::convert_array(dst, src, n, st);
 }
 
 /**

From 68639a296836430106014b06c1cb025c927a48a0 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 10 Feb 2022 09:59:30 -0500
Subject: [PATCH 11/24] Adding proper doxygen docs to lstsq

---
 cpp/include/raft/linalg/lstsq.hpp | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/cpp/include/raft/linalg/lstsq.hpp b/cpp/include/raft/linalg/lstsq.hpp
index cdf67e422d..57dd0a7b15 100644
--- a/cpp/include/raft/linalg/lstsq.hpp
+++ b/cpp/include/raft/linalg/lstsq.hpp
@@ -24,8 +24,15 @@ namespace linalg {
 /** Solves the linear ordinary least squares problem `Aw = b`
  *  Via SVD decomposition of `A = U S Vt` using default cuSOLVER routine.
  *
- *  @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines,
- *             so it's not guaranteed to stay unmodified.
+ * @param[in] handle raft handle
+ * @param[inout] A input feature matrix.
+ *            Warning: the content of this matrix is modified by the cuSOLVER routines.
+ * @param[in] n_rows number of rows in A
+ * @param[in] n_cols number of columns in A
+ * @param[inout] b input target vector.
+ *            Warning: the content of this vector is modified by the cuSOLVER routines.
+ * @param[out] w output coefficient vector
+ * @param[in] stream cuda stream for ordering operations
  */
 template <typename math_t>
 void lstsqSvdQR(const raft::handle_t& handle,
@@ -42,8 +49,15 @@ void lstsqSvdQR(const raft::handle_t& handle,
 /** Solves the linear ordinary least squares problem `Aw = b`
  *  Via SVD decomposition of `A = U S V^T` using Jacobi iterations (cuSOLVER).
  *
- *  @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines,
- *             so it's not guaranteed to stay unmodified.
+ * @param[in] handle raft handle
+ * @param[inout] A input feature matrix.
+ *            Warning: the content of this matrix is modified by the cuSOLVER routines.
+ * @param[in] n_rows number of rows in A
+ * @param[in] n_cols number of columns in A
+ * @param[inout] b input target vector.
+ *            Warning: the content of this vector is modified by the cuSOLVER routines.
+ * @param[out] w output coefficient vector
+ * @param[in] stream cuda stream for ordering operations
  */
 template <typename math_t>
 void lstsqSvdJacobi(const raft::handle_t& handle,
@@ -77,10 +91,15 @@ void lstsqEig(const raft::handle_t& handle,
  *  via QR decomposition of `A = QR`.
  *  (triangular system of equations `Rw = Q^T b`)
  *
- * @param A[in/out] - input feature matrix.
+ * @param[in] handle raft handle
+ * @param[inout] A input feature matrix.
  *            Warning: the content of this matrix is modified by the cuSOLVER routines.
- * @param b[in/out] - input target vector.
+ * @param[in] n_rows number of rows in A
+ * @param[in] n_cols number of columns in A
+ * @param[inout] b input target vector.
  *            Warning: the content of this vector is modified by the cuSOLVER routines.
+ * @param[out] w output coefficient vector
+ * @param[in] stream cuda stream for ordering operations
  */
 template <typename math_t>
 void lstsqQR(const raft::handle_t& handle,

From 34b54f44608eae572a88c974f1dad5a7d94b5240 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 10 Feb 2022 16:35:08 -0500
Subject: [PATCH 12/24] Updating gtests

---
 cpp/test/linalg/power.cu | 16 ++++++++++++----
 cpp/test/linalg/sqrt.cu  | 13 ++++++++-----
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/cpp/test/linalg/power.cu b/cpp/test/linalg/power.cu
index 8c862bbeab..0ec8613ce7 100644
--- a/cpp/test/linalg/power.cu
+++ b/cpp/test/linalg/power.cu
@@ -71,14 +71,21 @@ template <typename T>
 template <typename T>
 class PowerTest : public ::testing::TestWithParam<PowerInputs<T>> {
  protected:
-  PowerTest() : in1(0, stream), in2(0, stream), out_ref(0, stream), out(0, stream) {}
+  PowerTest()
+    : in1(0, handle.get_stream()),
+      in2(0, handle.get_stream()),
+      out_ref(0, handle.get_stream()),
+      out(0, handle.get_stream())
+  {
+  }
 
   void SetUp() override
   {
     params = ::testing::TestWithParam<PowerInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
     int len = params.len;
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+
+    cudaStream_t stream = handle.get_stream();
 
     in1.resize(len, stream);
     in2.resize(len, stream);
@@ -94,11 +101,12 @@ class PowerTest : public ::testing::TestWithParam<PowerInputs<T>> {
     powerScalar(out.data(), out.data(), T(2), len, stream);
     power(in1.data(), in1.data(), in2.data(), len, stream);
     powerScalar(in1.data(), in1.data(), T(2), len, stream);
-    RAFT_CUDA_TRY(cudaStreamDestroy(stream));
+
+    handle.sync_stream();
   }
 
  protected:
-  cudaStream_t stream = 0;
+  raft::handle_t handle;
   PowerInputs<T> params;
   rmm::device_uvector<T> in1, in2, out_ref, out;
   int device_count = 0;
diff --git a/cpp/test/linalg/sqrt.cu b/cpp/test/linalg/sqrt.cu
index 6aa6376c26..92c9626395 100644
--- a/cpp/test/linalg/sqrt.cu
+++ b/cpp/test/linalg/sqrt.cu
@@ -55,13 +55,16 @@ template <typename T>
 template <typename T>
 class SqrtTest : public ::testing::TestWithParam<SqrtInputs<T>> {
  protected:
-  SqrtTest() : in1(0, stream), out_ref(0, stream), out(0, stream) {}
+  SqrtTest()
+    : in1(0, handle.get_stream()), out_ref(0, handle.get_stream()), out(0, handle.get_stream())
+  {
+  }
 
   void SetUp() override
   {
-    params = ::testing::TestWithParam<SqrtInputs<T>>::GetParam();
+    auto stream = handle.get_stream();
+    params      = ::testing::TestWithParam<SqrtInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
     int len = params.len;
     in1.resize(len, stream);
     out_ref.resize(len, stream);
@@ -72,11 +75,11 @@ class SqrtTest : public ::testing::TestWithParam<SqrtInputs<T>> {
 
     sqrt(out.data(), in1.data(), len, stream);
     sqrt(in1.data(), in1.data(), len, stream);
-    RAFT_CUDA_TRY(cudaStreamDestroy(stream));
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
   }
 
  protected:
-  cudaStream_t stream = 0;
+  raft::handle_t handle;
   SqrtInputs<T> params;
   rmm::device_uvector<T> in1, out_ref, out;
   int device_count = 0;

From 587c0f10e48ffde39e89934370208aa4bbc2993d Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 10 Feb 2022 17:04:38 -0500
Subject: [PATCH 13/24] Moving remaining stats stuff over

---
 cpp/include/raft/common/seive.cuh             | 125 +++++
 cpp/include/raft/stats/cov.hpp                |  58 ++
 cpp/include/raft/stats/detail/cov.cuh         |  95 ++++
 cpp/include/raft/stats/detail/histogram.cuh   | 529 ++++++++++++++++++
 cpp/include/raft/stats/detail/minmax.cuh      | 247 ++++++++
 .../raft/stats/detail/weighted_mean.cuh       |  94 ++++
 cpp/include/raft/stats/histogram.hpp          |  60 ++
 cpp/include/raft/stats/minmax.hpp             |  70 +++
 cpp/include/raft/stats/weighted_mean.hpp      |  60 ++
 cpp/test/CMakeLists.txt                       |   5 +
 cpp/test/common/seive.cu                      |  35 ++
 cpp/test/stats/cov.cu                         | 185 ++++++
 cpp/test/stats/histogram.cu                   | 262 +++++++++
 cpp/test/stats/minmax.cu                      | 202 +++++++
 cpp/test/stats/weighted_mean.cu               | 231 ++++++++
 15 files changed, 2258 insertions(+)
 create mode 100644 cpp/include/raft/common/seive.cuh
 create mode 100644 cpp/include/raft/stats/cov.hpp
 create mode 100644 cpp/include/raft/stats/detail/cov.cuh
 create mode 100644 cpp/include/raft/stats/detail/histogram.cuh
 create mode 100644 cpp/include/raft/stats/detail/minmax.cuh
 create mode 100644 cpp/include/raft/stats/detail/weighted_mean.cuh
 create mode 100644 cpp/include/raft/stats/histogram.hpp
 create mode 100644 cpp/include/raft/stats/minmax.hpp
 create mode 100644 cpp/include/raft/stats/weighted_mean.hpp
 create mode 100644 cpp/test/common/seive.cu
 create mode 100644 cpp/test/stats/cov.cu
 create mode 100644 cpp/test/stats/histogram.cu
 create mode 100644 cpp/test/stats/minmax.cu
 create mode 100644 cpp/test/stats/weighted_mean.cu

diff --git a/cpp/include/raft/common/seive.cuh b/cpp/include/raft/common/seive.cuh
new file mode 100644
index 0000000000..580d9d91cb
--- /dev/null
+++ b/cpp/include/raft/common/seive.cuh
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <vector>
+
+// Taken from:
+//  https://github.com/teju85/programming/blob/master/euler/include/seive.h
+
+namespace raft {
+namespace common {
+
+/**
+ * @brief Implementation of the 'Sieve of Eratosthenes' prime-finding algorithm
+ */
+class Seive {
+ public:
+  /**
+   * @param _num number of integers for which seive is needed
+   */
+  Seive(unsigned _num)
+  {
+    N = _num;
+    generateSeive();
+  }
+
+  /**
+   * @brief Check whether a number is prime or not
+   * @param num number to be checked
+   * @return true if the 'num' is prime, else false
+   */
+  bool isPrime(unsigned num) const
+  {
+    unsigned mask, pos;
+    if (num <= 1) { return false; }
+    if (num == 2) { return true; }
+    if (!(num & 1)) { return false; }
+    getMaskPos(num, mask, pos);
+    return (seive[pos] & mask);
+  }
+
+ private:
+  void generateSeive()
+  {
+    auto sqN  = fastIntSqrt(N);
+    auto size = raft::ceildiv<unsigned>(N, sizeof(unsigned) * 8);
+    seive.resize(size);
+    // assume all to be primes initially
+    for (auto& itr : seive) {
+      itr = 0xffffffffu;
+    }
+    unsigned cid  = 0;
+    unsigned cnum = getNum(cid);
+    while (cnum <= sqN) {
+      do {
+        ++cid;
+        cnum = getNum(cid);
+        if (isPrime(cnum)) { break; }
+      } while (cnum <= sqN);
+      auto cnum2 = cnum << 1;
+      // 'unmark' all the 'odd' multiples of the current prime
+      for (unsigned i = 3, num = i * cnum; num <= N; i += 2, num += cnum2) {
+        unmark(num);
+      }
+    }
+  }
+
+  unsigned getId(unsigned num) const { return (num >> 1); }
+
+  unsigned getNum(unsigned id) const
+  {
+    if (id == 0) { return 2; }
+    return ((id << 1) + 1);
+  }
+
+  void getMaskPos(unsigned num, unsigned& mask, unsigned& pos) const
+  {
+    pos  = getId(num);
+    mask = 1 << (pos & 0x1f);
+    pos >>= 5;
+  }
+
+  void unmark(unsigned num)
+  {
+    unsigned mask, pos;
+    getMaskPos(num, mask, pos);
+    seive[pos] &= ~mask;
+  }
+
+  // REF: http://www.azillionmonkeys.com/qed/ulerysqroot.pdf
+  unsigned fastIntSqrt(unsigned val)
+  {
+    unsigned g = 0;
+    auto bshft = 15u, b = 1u << bshft;
+    do {
+      unsigned temp = ((g << 1) + b) << bshft--;
+      if (val >= temp) {
+        g += b;
+        val -= temp;
+      }
+    } while (b >>= 1);
+    return g;
+  }
+
+  /** find all primes till this number */
+  unsigned N;
+  /** the seive */
+  std::vector<unsigned> seive;
+};
+};  // namespace common
+};  // namespace raft
diff --git a/cpp/include/raft/stats/cov.hpp b/cpp/include/raft/stats/cov.hpp
new file mode 100644
index 0000000000..dc5bc63ee8
--- /dev/null
+++ b/cpp/include/raft/stats/cov.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/stats/detail/cov.cuh>
+namespace raft {
+namespace stats {
+/**
+ * @brief Compute covariance of the input matrix
+ *
+ * Mean operation is assumed to be performed on a given column.
+ *
+ * @tparam Type the data type
+ * @param covar the output covariance matrix
+ * @param data the input matrix (this will get mean-centered at the end!)
+ * @param mu mean vector of the input matrix
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param sample whether to evaluate sample covariance or not. In other words,
+ * whether to normalize the output using N-1 or N, for true or false,
+ * respectively
+ * @param rowMajor whether the input data is row or col major
+ * @param stable whether to run the slower-but-numerically-stable version or not
+ * @param handle raft handle (supplies the cuBLAS handle used internally)
+ * @param stream cuda stream
+ * @note if stable=true, then the input data will be mean centered after this
+ * function returns!
+ */
+template <typename Type>
+void cov(const raft::handle_t& handle,
+         Type* covar,
+         Type* data,
+         const Type* mu,
+         std::size_t D,
+         std::size_t N,
+         bool sample,
+         bool rowMajor,
+         bool stable,
+         cudaStream_t stream)
+{
+  detail::cov(handle, covar, data, mu, D, N, sample, rowMajor, stable, stream);
+}
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/detail/cov.cuh b/cpp/include/raft/stats/detail/cov.cuh
new file mode 100644
index 0000000000..7e3fc701a1
--- /dev/null
+++ b/cpp/include/raft/stats/detail/cov.cuh
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/linalg/gemm.hpp>
+#include <raft/stats/mean_center.hpp>
+
+namespace raft {
+namespace stats {
+namespace detail {
+/**
+ * @brief Compute covariance of the input matrix
+ *
+ * Mean operation is assumed to be performed on a given column.
+ *
+ * @tparam Type the data type
+ * @param covar the output covariance matrix
+ * @param data the input matrix (this will get mean-centered at the end!)
+ * @param mu mean vector of the input matrix
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param sample whether to evaluate sample covariance or not. In other words,
+ * whether to normalize the output using N-1 or N, for true or false,
+ * respectively
+ * @param rowMajor whether the input data is row or col major
+ * @param stable whether to run the slower-but-numerically-stable version or not
+ * @param handle raft handle (supplies the cuBLAS handle used internally)
+ * @param stream cuda stream
+ * @note if stable=true, then the input data will be mean centered after this
+ * function returns!
+ */
+template <typename Type>
+void cov(const raft::handle_t& handle,
+         Type* covar,
+         Type* data,
+         const Type* mu,
+         std::size_t D,
+         std::size_t N,
+         bool sample,
+         bool rowMajor,
+         bool stable,
+         cudaStream_t stream)
+{
+  if (stable) {
+    cublasHandle_t cublas_h = handle.get_cublas_handle();
+
+    // since mean operation is assumed to be along a given column, broadcast
+    // must be along rows!
+    raft::stats::meanCenter(data, data, mu, D, N, rowMajor, true, stream);
+    Type alpha = Type(1) / (sample ? Type(N - 1) : Type(N));
+    Type beta  = Type(0);
+    if (rowMajor) {
+      // #TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h,
+                                                       CUBLAS_OP_N,
+                                                       CUBLAS_OP_T,
+                                                       D,
+                                                       D,
+                                                       N,
+                                                       &alpha,
+                                                       data,
+                                                       D,
+                                                       data,
+                                                       D,
+                                                       &beta,
+                                                       covar,
+                                                       D,
+                                                       stream));
+    } else {
+      raft::linalg::gemm(
+        handle, data, N, D, data, covar, D, D, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream);
+    }
+  } else {
+    ///@todo: implement this using cutlass + customized epilogue!
+    ASSERT(false, "cov: Implement stable=false case!");
+  }
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+};  // end namespace detail
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/detail/histogram.cuh b/cpp/include/raft/stats/detail/histogram.cuh
new file mode 100644
index 0000000000..8c69ba1459
--- /dev/null
+++ b/cpp/include/raft/stats/detail/histogram.cuh
@@ -0,0 +1,529 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/common/seive.cuh>
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/vectorized.cuh>
+#include <stdint.h>
+
+// This file is a shameless amalgamation of independent works done by
+// Lars Nyland and Andy Adinets
+
+///@todo: add cub's histogram as another option
+
+namespace raft {
+namespace stats {
+namespace detail {
+
+/** Default mapper which just returns the value of the data itself */
+template <typename DataT, typename IdxT>
+struct IdentityBinner {
+  DI int operator()(DataT val, IdxT row, IdxT col) { return int(val); }
+};
+
+/** Types of supported histogram implementations */
+enum HistType {
+  /** shared mem atomics, bins packed as 1-bit counters (value == bits per bin) */
+  HistTypeSmemBits1 = 1,
+  /** shared mem atomics, bins packed as 2-bit counters */
+  HistTypeSmemBits2 = 2,
+  /** shared mem atomics, bins packed as 4-bit counters */
+  HistTypeSmemBits4 = 4,
+  /** shared mem atomics, bins packed as 1-byte counters */
+  HistTypeSmemBits8 = 8,
+  /** shared mem atomics, bins packed as 2-byte counters */
+  HistTypeSmemBits16 = 16,
+  /** use only global atomics */
+  HistTypeGmem,
+  /** uses shared mem atomics to reduce global traffic */
+  HistTypeSmem,
+  /**
+   * uses shared mem atomics with match_any intrinsic to further reduce shared
+   * memory traffic. This can only be enabled on Volta and later architectures.
+   * If one tries to enable this for older arch's, it will fall back to
+   * `HistTypeSmem`.
+   * @note This is to be used only when the input dataset leads to a lot of
+   *       repetitions in a given warp, else, this algo can be much slower than
+   *       `HistTypeSmem`!
+   */
+  HistTypeSmemMatchAny,
+  /** builds a hashmap of active bins in shared mem */
+  HistTypeSmemHash,
+  /** decide at runtime the best algo for the given inputs */
+  HistTypeAuto
+};
+
+static const int ThreadsPerBlock = 256;
+
+template <typename IdxT, int VecLen>
+dim3 computeGridDim(IdxT nrows, IdxT ncols, const void* kernel)
+{
+  int occupancy;
+  RAFT_CUDA_TRY(
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, ThreadsPerBlock, 0));
+  const auto maxBlks = occupancy * raft::getMultiProcessorCount();
+  int nblksx         = raft::ceildiv<int>(VecLen ? nrows / VecLen : nrows, ThreadsPerBlock);
+  // for cases when there aren't a lot of blocks for computing one histogram
+  nblksx = std::min(nblksx, maxBlks);
+  return dim3(nblksx, ncols);
+}
+
+// Feeds each element of column `col` through `binner`, then calls `op(binId, row, col)`.
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen, typename CoreOp>
+DI void histCoreOp(const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner, CoreOp op, IdxT col)
+{
+  IdxT offset = col * nrows;
+  auto bdim   = IdxT(blockDim.x);
+  IdxT tid    = threadIdx.x + bdim * blockIdx.x;
+  tid *= VecLen;
+  IdxT stride = bdim * gridDim.x * VecLen;
+  IdxT nCeil  = raft::alignTo<IdxT>(nrows, stride);  // IdxT, not int: no 64-bit truncation
+  raft::TxN_t<DataT, VecLen> a;
+  for (auto i = tid; i < nCeil; i += stride) {
+    if (i < nrows) { a.load(data, offset + i); }  // past the end `a` keeps its old contents
+#pragma unroll
+    for (int j = 0; j < VecLen; ++j) {
+      int binId = binner(a.val.data[j], i + j, col);
+      op(binId, i + j, col);  // also invoked for rows >= nrows; `op` must filter those
+    }
+  }
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen>
+__global__ void gmemHistKernel(
+  int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner)
+{
+  auto op = [=] __device__(int binId, IdxT row, IdxT col) {
+    if (row >= nrows) return;
+    auto binOffset = col * nbins;
+#if __CUDA_ARCH__ < 700
+    raft::myAtomicAdd(bins + binOffset + binId, 1);
+#else
+    auto amask  = __activemask();
+    auto mask   = __match_any_sync(amask, binId);
+    auto leader = __ffs(mask) - 1;
+    if (raft::laneId() == leader) { raft::myAtomicAdd(bins + binOffset + binId, __popc(mask)); }
+#endif  // __CUDA_ARCH__
+  };
+  histCoreOp<DataT, BinnerOp, IdxT, VecLen>(data, nrows, nbins, binner, op, blockIdx.y);
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen>
+void gmemHist(int* bins,
+              IdxT nbins,
+              const DataT* data,
+              IdxT nrows,
+              IdxT ncols,
+              BinnerOp binner,
+              cudaStream_t stream)
+{
+  auto blks = computeGridDim<IdxT, VecLen>(
+    nrows, ncols, (const void*)gmemHistKernel<DataT, BinnerOp, IdxT, VecLen>);
+  gmemHistKernel<DataT, BinnerOp, IdxT, VecLen>
+    <<<blks, ThreadsPerBlock, 0, stream>>>(bins, data, nrows, nbins, binner);
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen, bool UseMatchAny>
+__global__ void smemHistKernel(
+  int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner)
+{
+  extern __shared__ unsigned sbins[];
+  for (auto i = threadIdx.x; i < nbins; i += blockDim.x) {
+    sbins[i] = 0;
+  }
+  __syncthreads();
+  auto op = [=] __device__(int binId, IdxT row, IdxT col) {
+    if (row >= nrows) return;
+#if __CUDA_ARCH__ < 700
+    raft::myAtomicAdd<unsigned int>(sbins + binId, 1);
+#else
+    if (UseMatchAny) {
+      auto amask  = __activemask();
+      auto mask   = __match_any_sync(amask, binId);
+      auto leader = __ffs(mask) - 1;
+      if (raft::laneId() == leader) {
+        raft::myAtomicAdd<unsigned int>(sbins + binId, __popc(mask));
+      }
+    } else {
+      raft::myAtomicAdd<unsigned int>(sbins + binId, 1);
+    }
+#endif  // __CUDA_ARCH__
+  };
+  IdxT col = blockIdx.y;
+  histCoreOp<DataT, BinnerOp, IdxT, VecLen>(data, nrows, nbins, binner, op, col);
+  __syncthreads();
+  auto binOffset = col * nbins;
+  for (auto i = threadIdx.x; i < nbins; i += blockDim.x) {
+    auto val = sbins[i];
+    if (val > 0) { raft::myAtomicAdd<unsigned int>((unsigned int*)bins + binOffset + i, val); }
+  }
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen, bool UseMatchAny>
+void smemHist(int* bins,
+              IdxT nbins,
+              const DataT* data,
+              IdxT nrows,
+              IdxT ncols,
+              BinnerOp binner,
+              cudaStream_t stream)
+{
+  auto blks = computeGridDim<IdxT, VecLen>(
+    nrows, ncols, (const void*)smemHistKernel<DataT, BinnerOp, IdxT, VecLen, UseMatchAny>);
+  size_t smemSize = nbins * sizeof(unsigned);
+  smemHistKernel<DataT, BinnerOp, IdxT, VecLen, UseMatchAny>
+    <<<blks, ThreadsPerBlock, smemSize, stream>>>(bins, data, nrows, nbins, binner);
+}
+
+template <unsigned _BIN_BITS>
+struct BitsInfo {
+  static unsigned const BIN_BITS  = _BIN_BITS;
+  static unsigned const WORD_BITS = sizeof(unsigned) * 8;
+  static unsigned const WORD_BINS = WORD_BITS / BIN_BITS;
+  static unsigned const BIN_MASK  = (1 << BIN_BITS) - 1;
+};
+
+template <unsigned BIN_BITS>
+DI void incrementBin(unsigned* sbins, int* bins, int nbins, int binId)
+{
+  typedef BitsInfo<BIN_BITS> Bits;
+  auto iword    = binId / Bits::WORD_BINS;
+  auto ibin     = binId % Bits::WORD_BINS;
+  auto sh       = ibin * Bits::BIN_BITS;
+  auto old_word = atomicAdd(sbins + iword, unsigned(1 << sh));
+  auto new_word = old_word + unsigned(1 << sh);
+  if ((new_word >> sh & Bits::BIN_MASK) != 0) return;
+  // overflow
+  raft::myAtomicAdd<unsigned int>((unsigned int*)bins + binId, Bits::BIN_MASK + 1);
+  for (int dbin = 1; ibin + dbin < Bits::WORD_BINS && binId + dbin < nbins; ++dbin) {
+    auto sh1 = (ibin + dbin) * Bits::BIN_BITS;
+    if ((new_word >> sh1 & Bits::BIN_MASK) == 0) {
+      // overflow
+      raft::myAtomicAdd<unsigned int>((unsigned int*)bins + binId + dbin, Bits::BIN_MASK);
+    } else {
+      // correction
+      raft::myAtomicAdd(bins + binId + dbin, -1);
+      break;
+    }
+  }
+}
+
+template <>
+DI void incrementBin<1>(unsigned* sbins, int* bins, int nbins, int binId)
+{
+  typedef BitsInfo<1> Bits;
+  auto iword    = binId / Bits::WORD_BITS;
+  auto sh       = binId % Bits::WORD_BITS;
+  auto old_word = atomicXor(sbins + iword, unsigned(1 << sh));
+  if ((old_word >> sh & 1) != 0) raft::myAtomicAdd(bins + binId, 2);
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int BIN_BITS, int VecLen>
+__global__ void smemBitsHistKernel(
+  int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner)
+{
+  extern __shared__ unsigned sbins[];
+  typedef BitsInfo<BIN_BITS> Bits;
+  auto nwords = raft::ceildiv<int>(nbins, Bits::WORD_BINS);
+  for (auto j = threadIdx.x; j < nwords; j += blockDim.x) {
+    sbins[j] = 0;
+  }
+  __syncthreads();
+  IdxT col       = blockIdx.y;
+  IdxT binOffset = col * nbins;
+  auto op        = [=] __device__(int binId, IdxT row, IdxT col) {
+    if (row >= nrows) return;
+    incrementBin<Bits::BIN_BITS>(sbins, bins + binOffset, (int)nbins, binId);
+  };
+  histCoreOp<DataT, BinnerOp, IdxT, VecLen>(data, nrows, nbins, binner, op, col);
+  __syncthreads();
+  for (auto j = threadIdx.x; j < (int)nbins; j += blockDim.x) {
+    auto shift = j % Bits::WORD_BINS * Bits::BIN_BITS;
+    int count  = sbins[j / Bits::WORD_BINS] >> shift & Bits::BIN_MASK;
+    if (count > 0) raft::myAtomicAdd(bins + binOffset + j, count);
+  }
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int BIN_BITS, int VecLen>
+void smemBitsHist(int* bins,
+                  IdxT nbins,
+                  const DataT* data,
+                  IdxT nrows,
+                  IdxT ncols,
+                  BinnerOp binner,
+                  cudaStream_t stream)
+{
+  typedef BitsInfo<BIN_BITS> Bits;
+  auto blks = computeGridDim<IdxT, VecLen>(
+    nrows, ncols, (const void*)smemBitsHistKernel<DataT, BinnerOp, IdxT, Bits::BIN_BITS, VecLen>);
+  size_t smemSize = raft::ceildiv<size_t>(nbins, Bits::WORD_BITS / Bits::BIN_BITS) * sizeof(int);
+  smemBitsHistKernel<DataT, BinnerOp, IdxT, Bits::BIN_BITS, VecLen>
+    <<<blks, ThreadsPerBlock, smemSize, stream>>>(bins, data, nrows, nbins, binner);
+}
+
+#define INVALID_KEY -1
+
+DI void clearHashTable(int2* ht, int hashSize)
+{
+  for (auto i = threadIdx.x; i < hashSize; i += blockDim.x) {
+    ht[i] = {INVALID_KEY, 0};
+  }
+}
+
+DI int findEntry(int2* ht, int hashSize, int binId, int threshold)
+{
+  int idx = binId % hashSize;
+  int t;
+  int count = 0;
+  while ((t = atomicCAS(&(ht[idx].x), INVALID_KEY, binId)) != INVALID_KEY && t != binId) {
+    ++count;
+    if (count >= threshold) {
+      idx = INVALID_KEY;
+      break;
+    }
+    ++idx;
+    if (idx >= hashSize) { idx = 0; }
+  }
+  return idx;
+}
+
+DI void flushHashTable(int2* ht, int hashSize, int* bins, int nbins, int col)
+{
+  int binOffset = col * nbins;
+  for (auto i = threadIdx.x; i < hashSize; i += blockDim.x) {
+    if (ht[i].x != INVALID_KEY && ht[i].y > 0) {
+      raft::myAtomicAdd(bins + binOffset + ht[i].x, ht[i].y);
+    }
+    ht[i] = {INVALID_KEY, 0};
+  }
+}
+
+#undef INVALID_KEY
+
+///@todo: honor VecLen template param
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen>
+__global__ void smemHashHistKernel(int* bins,
+                                   const DataT* data,
+                                   IdxT nrows,
+                                   IdxT nbins,
+                                   BinnerOp binner,
+                                   int hashSize,
+                                   int threshold)
+{
+  extern __shared__ int2 ht[];
+  int* needFlush = (int*)&(ht[hashSize]);
+  if (threadIdx.x == 0) { needFlush[0] = 0; }
+  clearHashTable(ht, hashSize);
+  __syncthreads();
+  auto op = [=] __device__(int binId, IdxT row, IdxT col) {
+    bool iNeedFlush = false;
+    if (row < nrows) {
+      int hidx = findEntry(ht, hashSize, binId, threshold);
+      if (hidx >= 0) {
+        raft::myAtomicAdd(&(ht[hidx].y), 1);
+      } else {
+        needFlush[0] = 1;
+        iNeedFlush   = true;
+      }
+    }
+    __syncthreads();
+    if (needFlush[0]) {
+      flushHashTable(ht, hashSize, bins, nbins, col);
+      __syncthreads();
+      if (threadIdx.x == 0) { needFlush[0] = 0; }
+      __syncthreads();
+    }
+    if (iNeedFlush) {
+      int hidx = findEntry(ht, hashSize, binId, threshold);
+      // all threads are bound to get one valid entry as all threads in this
+      // block will make forward progress due to the __syncthreads call in the
+      // subsequent iteration
+      raft::myAtomicAdd(&(ht[hidx].y), 1);
+    }
+  };
+  IdxT col = blockIdx.y;
+  histCoreOp<DataT, BinnerOp, IdxT, VecLen>(data, nrows, nbins, binner, op, col);
+  __syncthreads();
+  flushHashTable(ht, hashSize, bins, nbins, col);
+}
+
+/** Largest prime hash-table length whose table plus one flag int fits in shared memory. */
+inline int computeHashTableSize()
+{
+  static const unsigned maxBinsEverPossible = 256 * 1024;  // far beyond any smem size today
+  static raft::common::Seive primes(maxBinsEverPossible);
+  unsigned smem = raft::getSharedMemPerBlock();
+  // each entry is an int2 {bin id, count}; keep one int free for the kernel's `needFlush` flag
+  auto binsPossible = smem > sizeof(int) ? (smem - sizeof(int)) / sizeof(int2) : size_t(0);
+  for (; binsPossible > 1; --binsPossible) {
+    if (primes.isPrime(binsPossible)) return (int)binsPossible;
+  }
+  return 1;  // should not happen!
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen>
+void smemHashHist(int* bins,
+                  IdxT nbins,
+                  const DataT* data,
+                  IdxT nrows,
+                  IdxT ncols,
+                  BinnerOp binner,
+                  cudaStream_t stream)
+{
+  static const int flushThreshold = 10;
+  auto blks                       = computeGridDim<IdxT, 1>(
+    nrows, ncols, (const void*)smemHashHistKernel<DataT, BinnerOp, IdxT, 1>);
+  int hashSize    = computeHashTableSize();
+  size_t smemSize = hashSize * sizeof(int2) + sizeof(int);
+  smemHashHistKernel<DataT, BinnerOp, IdxT, 1><<<blks, ThreadsPerBlock, smemSize, stream>>>(
+    bins, data, nrows, nbins, binner, hashSize, flushThreshold);
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT, int VecLen>
+void histogramVecLen(HistType type,
+                     int* bins,
+                     IdxT nbins,
+                     const DataT* data,
+                     IdxT nrows,
+                     IdxT ncols,
+                     cudaStream_t stream,
+                     BinnerOp binner)
+{
+  RAFT_CUDA_TRY(cudaMemsetAsync(bins, 0, ncols * nbins * sizeof(int), stream));
+  switch (type) {
+    case HistTypeGmem:
+      gmemHist<DataT, BinnerOp, IdxT, VecLen>(bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmem:
+      smemHist<DataT, BinnerOp, IdxT, VecLen, false>(
+        bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmemMatchAny:
+      smemHist<DataT, BinnerOp, IdxT, VecLen, true>(
+        bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmemBits16:
+      smemBitsHist<DataT, BinnerOp, IdxT, 16, VecLen>(
+        bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmemBits8:
+      smemBitsHist<DataT, BinnerOp, IdxT, 8, VecLen>(
+        bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmemBits4:
+      smemBitsHist<DataT, BinnerOp, IdxT, 4, VecLen>(
+        bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmemBits2:
+      smemBitsHist<DataT, BinnerOp, IdxT, 2, VecLen>(
+        bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmemBits1:
+      smemBitsHist<DataT, BinnerOp, IdxT, 1, VecLen>(
+        bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    case HistTypeSmemHash:
+      smemHashHist<DataT, BinnerOp, IdxT, VecLen>(bins, nbins, data, nrows, ncols, binner, stream);
+      break;
+    default: ASSERT(false, "histogram: Invalid type passed '%d'!", type);
+  };
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+template <typename DataT, typename BinnerOp, typename IdxT>
+void histogramImpl(HistType type,
+                   int* bins,
+                   IdxT nbins,
+                   const DataT* data,
+                   IdxT nrows,
+                   IdxT ncols,
+                   cudaStream_t stream,
+                   BinnerOp binner)
+{
+  size_t bytes = nrows * sizeof(DataT);
+  if (nrows <= 0) return;
+  if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) {
+    histogramVecLen<DataT, BinnerOp, IdxT, 16 / sizeof(DataT)>(
+      type, bins, nbins, data, nrows, ncols, stream, binner);
+  } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) {
+    histogramVecLen<DataT, BinnerOp, IdxT, 8 / sizeof(DataT)>(
+      type, bins, nbins, data, nrows, ncols, stream, binner);
+  } else if (4 % sizeof(DataT) == 0 && bytes % 4 == 0) {
+    histogramVecLen<DataT, BinnerOp, IdxT, 4 / sizeof(DataT)>(
+      type, bins, nbins, data, nrows, ncols, stream, binner);
+  } else if (2 % sizeof(DataT) == 0 && bytes % 2 == 0) {
+    histogramVecLen<DataT, BinnerOp, IdxT, 2 / sizeof(DataT)>(
+      type, bins, nbins, data, nrows, ncols, stream, binner);
+  } else {
+    histogramVecLen<DataT, BinnerOp, IdxT, 1>(
+      type, bins, nbins, data, nrows, ncols, stream, binner);
+  }
+}
+
+template <typename IdxT>
+HistType selectBestHistAlgo(IdxT nbins)
+{
+  size_t smem         = raft::getSharedMemPerBlock();
+  size_t requiredSize = nbins * sizeof(unsigned);
+  if (requiredSize <= smem) { return HistTypeSmem; }
+  for (int bits = 16; bits >= 1; bits >>= 1) {
+    auto nBytesForBins = raft::ceildiv<size_t>(bits * nbins, 8);
+    requiredSize       = raft::alignTo<size_t>(nBytesForBins, sizeof(unsigned));
+    if (requiredSize <= smem) { return static_cast<HistType>(bits); }
+  }
+  return HistTypeGmem;
+}
+
+/**
+ * @brief Perform histogram on the input data. It chooses the right load size
+ * based on the input data vector length. It also supports large-bin cases
+ * using a specialized smem-based hashing technique.
+ * @tparam DataT input data type
+ * @tparam IdxT data type used to compute indices
+ * @tparam BinnerOp takes the input data and computes its bin index
+ * @param type histogram implementation type to choose
+ * @param bins the output bins (length = ncols * nbins)
+ * @param nbins number of bins
+ * @param data input data (length = ncols * nrows)
+ * @param nrows data array length in each column (or batch)
+ * @param ncols number of columns (or batch size)
+ * @param stream cuda stream
+ * @param binner the operation that computes the bin index of the input data
+ *
+ * @note signature of BinnerOp is `int func(DataT val, IdxT row, IdxT col);`
+ */
+template <typename DataT, typename IdxT = int, typename BinnerOp = IdentityBinner<DataT, IdxT>>
+void histogram(HistType type,
+               int* bins,
+               IdxT nbins,
+               const DataT* data,
+               IdxT nrows,
+               IdxT ncols,
+               cudaStream_t stream,
+               BinnerOp binner = IdentityBinner<DataT, IdxT>())
+{
+  HistType computedType = type;
+  if (type == HistTypeAuto) { computedType = selectBestHistAlgo(nbins); }
+  histogramImpl<DataT, BinnerOp, IdxT>(
+    computedType, bins, nbins, data, nrows, ncols, stream, binner);
+}
+
+};  // end namespace detail
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/detail/minmax.cuh b/cpp/include/raft/stats/detail/minmax.cuh
new file mode 100644
index 0000000000..c2b14f1544
--- /dev/null
+++ b/cpp/include/raft/stats/detail/minmax.cuh
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+
+#include <limits>
+
+namespace raft {
+namespace stats {
+
+namespace detail {
+
+/**
+ * @brief Reinterpret the object representation of `from` as a value of type `To`.
+ * Both types must have the same size (checked at compile time).
+ * @todo replace with `std::bit_cast` once we adopt C++20 or libcu++ adds it
+ */
+template <class To, class From>
+constexpr To bit_cast(const From& from) noexcept
+{
+  To to{};
+  static_assert(sizeof(To) == sizeof(From));
+  memcpy(&to, &from, sizeof(To));
+  return to;
+}
+
+// NOTE: `detail` intentionally stays open here; it is closed at the end of this file.
+
+// encode_traits<T>::E is the integer type used to hold encoded values of T (see encode/decode).
+template <typename T>
+struct encode_traits {};
+
+template <>
+struct encode_traits<float> {
+  typedef int E;
+};
+
+template <>
+struct encode_traits<double> {
+  typedef long long E;
+};
+
+HDI int encode(float val)
+{
+  const int bits = detail::bit_cast<int>(val);
+  return (bits < 0) ? ((1 << 31) | ~bits) : bits;  // negatives: invert every bit but the sign
+}
+
+HDI long long encode(double val)
+{
+  const std::int64_t bits = detail::bit_cast<std::int64_t>(val);
+  return (bits < 0) ? ((1ULL << 63) | ~bits) : bits;  // 64-bit twin of encode(float)
+}
+
+HDI float decode(int val)
+{
+  const int bits = (val < 0) ? ((1 << 31) | ~val) : val;  // same bit transform as encode(float)
+  return detail::bit_cast<float>(bits);
+}
+
+HDI double decode(long long val)
+{
+  const long long bits = (val < 0) ? ((1ULL << 63) | ~val) : val;  // as in encode(double)
+  return detail::bit_cast<double>(bits);
+}
+
+template <typename T, typename E>
+DI T atomicMaxBits(T* address, T val)
+{
+  const E previous = atomicMax(reinterpret_cast<E*>(address), encode(val));
+  return decode(previous);  // value atomicMax returned for the slot, decoded back to T
+}
+
+template <typename T, typename E>
+DI T atomicMinBits(T* address, T val)
+{
+  const E previous = atomicMin(reinterpret_cast<E*>(address), encode(val));
+  return decode(previous);  // value atomicMin returned for the slot, decoded back to T
+}
+
+template <typename T, typename E>
+__global__ void decodeKernel(T* globalmin, T* globalmax, int ncols)
+{
+  // One thread per column: replace the encoded E bits held in each slot by the decoded T.
+  const int col = threadIdx.x + blockIdx.x * blockDim.x;
+  if (col >= ncols) { return; }
+  globalmin[col] = decode(*reinterpret_cast<E*>(globalmin + col));
+  globalmax[col] = decode(*reinterpret_cast<E*>(globalmax + col));
+}
+
+///@todo: implement a proper "fill" kernel
+template <typename T, typename E>
+__global__ void minmaxInitKernel(int ncols, T* globalmin, T* globalmax, T init_val)
+{
+  const int col = threadIdx.x + blockIdx.x * blockDim.x;
+  if (col >= ncols) { return; }  // threads past the last column have nothing to seed
+  *reinterpret_cast<E*>(globalmin + col) = encode(init_val);   // min starts at +init_val
+  *reinterpret_cast<E*>(globalmax + col) = encode(-init_val);  // max starts at -init_val
+}
+
+template <typename T, typename E>
+__global__ void minmaxKernel(const T* data,
+                             const unsigned int* rowids,
+                             const unsigned int* colids,
+                             int nrows,
+                             int ncols,
+                             int row_stride,
+                             T* g_min,
+                             T* g_max,
+                             T* sampledcols,
+                             T init_min_val,
+                             int batch_ncols,
+                             int num_batches)
+{
+  // Per-block min/max are built in shared memory, then merged into g_min / g_max atomically.
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  extern __shared__ char shmem[];
+  T* s_min = (T*)shmem;
+  T* s_max = (T*)(shmem + sizeof(T) * batch_ncols);
+  // Columns are handled in num_batches chunks of batch_ncols; the last chunk may be shorter.
+  int last_batch_ncols = ncols % batch_ncols;
+  if (last_batch_ncols == 0) { last_batch_ncols = batch_ncols; }
+  int orig_batch_ncols = batch_ncols;
+
+  for (int batch_id = 0; batch_id < num_batches; batch_id++) {
+    if (batch_id == num_batches - 1) { batch_ncols = last_batch_ncols; }
+    const int first_col = batch_id * orig_batch_ncols;  // first column of this batch
+    for (int i = threadIdx.x; i < batch_ncols; i += blockDim.x) {
+      *(E*)&s_min[i] = encode(init_min_val);
+      *(E*)&s_max[i] = encode(-init_min_val);
+    }
+    __syncthreads();
+    for (int i = tid; i < nrows * batch_ncols; i += blockDim.x * gridDim.x) {
+      int col = first_col + (i / nrows);
+      int row = i % nrows;
+      if (colids != nullptr) { col = colids[col]; }
+      if (rowids != nullptr) { row = rowids[row]; }
+      int index = row + col * row_stride;
+      T coldata = data[index];
+      if (!isnan(coldata)) {
+        // Min max values are saved in shared memory and global memory as per the shuffled colids.
+        atomicMinBits<T, E>(&s_min[(int)(i / nrows)], coldata);
+        atomicMaxBits<T, E>(&s_max[(int)(i / nrows)], coldata);
+      }
+      // col-major output, nrows entries per column => this batch starts at first_col * nrows
+      if (sampledcols != nullptr) { sampledcols[first_col * nrows + i] = coldata; }
+    }
+    __syncthreads();
+    // finally, perform global mem atomics
+    for (int j = threadIdx.x; j < batch_ncols; j += blockDim.x) {
+      atomicMinBits<T, E>(&g_min[first_col + j], decode(*(E*)&s_min[j]));
+      atomicMaxBits<T, E>(&g_max[first_col + j], decode(*(E*)&s_max[j]));
+    }
+    __syncthreads();
+  }
+}
+
+/**
+ * @brief Computes min/max across every column of the input matrix, as well as
+ * optionally allow to subsample based on the given row/col ID mapping vectors
+ *
+ * @tparam T the data type
+ * @tparam TPB number of threads per block
+ * @param data input data
+ * @param rowids actual row ID mappings. It is of length nrows. If you want to
+ * skip this index lookup entirely, pass nullptr
+ * @param colids actual col ID mappings. It is of length ncols. If you want to
+ * skip this index lookup entirely, pass nullptr
+ * @param nrows number of rows of data to be worked upon. The actual rows of the
+ * input "data" can be bigger than this!
+ * @param ncols number of cols of data to be worked upon. The actual cols of the
+ * input "data" can be bigger than this!
+ * @param row_stride stride (in number of elements) between 2 adjacent columns
+ * @param globalmin final col-wise global minimum (size = ncols)
+ * @param globalmax final col-wise global maximum (size = ncols)
+ * @param sampledcols output sampled data. Pass nullptr if you don't need this
+ * @param stream cuda stream
+ * @note This method makes the following assumptions:
+ * 1. input and output matrices are assumed to be col-major
+ * 2. columns are processed in shared-memory sized batches, so ncols is not limited by
+ *    the shared memory of a block. The call is a no-op when ncols <= 0.
+ */
+template <typename T, int TPB = 512>
+void minmax(const T* data,
+            const unsigned* rowids,
+            const unsigned* colids,
+            int nrows,
+            int ncols,
+            int row_stride,
+            T* globalmin,
+            T* globalmax,
+            T* sampledcols,
+            cudaStream_t stream)
+{
+  using E = typename encode_traits<T>::E;
+  if (ncols <= 0) { return; }  // nothing to do; also avoids ceildiv(ncols, 0) further down
+  // Init and decode touch one slot per column, so both use a column-sized grid.
+  const int col_blks = raft::ceildiv(ncols, TPB);
+  T init_val         = std::numeric_limits<T>::max();
+  minmaxInitKernel<T, E><<<col_blks, TPB, 0, stream>>>(ncols, globalmin, globalmax, init_val);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  int nblks = min(raft::ceildiv(nrows * ncols, TPB), 65536);
+
+  // Compute the batch_ncols, in [1, ncols] range, that meet the available
+  // shared memory constraints.
+  auto smemPerBlk = raft::getSharedMemPerBlock();
+  int batch_ncols = min(ncols, (int)(smemPerBlk / (sizeof(T) * 2)));
+  int num_batches = raft::ceildiv(ncols, batch_ncols);
+  size_t smemSize = sizeof(T) * 2 * batch_ncols;
+
+  minmaxKernel<T, E><<<nblks, TPB, smemSize, stream>>>(data,
+                                                       rowids,
+                                                       colids,
+                                                       nrows,
+                                                       ncols,
+                                                       row_stride,
+                                                       globalmin,
+                                                       globalmax,
+                                                       sampledcols,
+                                                       init_val,
+                                                       batch_ncols,
+                                                       num_batches);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  decodeKernel<T, E><<<col_blks, TPB, 0, stream>>>(globalmin, globalmax, ncols);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+};  // end namespace detail
+};  // end namespace stats
+}
+;  // end namespace raft
diff --git a/cpp/include/raft/stats/detail/weighted_mean.cuh b/cpp/include/raft/stats/detail/weighted_mean.cuh
new file mode 100644
index 0000000000..ca7fc136d3
--- /dev/null
+++ b/cpp/include/raft/stats/detail/weighted_mean.cuh
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cudart_utils.h>
+#include <raft/linalg/coalesced_reduction.hpp>
+#include <raft/linalg/strided_reduction.hpp>
+
+namespace raft {
+namespace stats {
+namespace detail {
+
+/**
+ * @brief Compute the row-wise weighted mean of the input matrix
+ *
+ * @tparam Type the data type
+ * @param mu the output mean vector (mu[0] also serves as scratch for the weight sum)
+ * @param data the input matrix (assumed to be row-major)
+ * @param weights the weights; D of them are summed to normalise the result
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param stream cuda stream to launch work on
+ */
+template <typename Type>
+void rowWeightedMean(
+  Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream)
+{
+  // First pass: reduce the weights into mu[0] and fetch that sum to the host.
+  Type weight_sum = 0;
+  raft::linalg::coalescedReduction(mu, weights, D, 1, (Type)0, stream, false);
+  raft::update_host(&weight_sum, mu, 1, stream);
+  // NOTE(review): weight_sum is captured by value below; presumes the copy has landed - confirm
+  raft::linalg::coalescedReduction(
+    mu,
+    data,
+    D,
+    N,
+    (Type)0,
+    stream,
+    false,
+    [weights] __device__(Type val, int idx) { return val * weights[idx]; },
+    [] __device__(Type lhs, Type rhs) { return lhs + rhs; },
+    [weight_sum] __device__(Type acc) { return acc / weight_sum; });
+}
+
+/**
+ * @brief Compute the column-wise weighted mean of the input matrix
+ *
+ * @tparam Type the data type
+ * @param mu the output mean vector (mu[0] also serves as scratch for the weight sum)
+ * @param data the input matrix (assumed to be column-major)
+ * @param weights the weights; N of them are summed to normalise the result
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param stream cuda stream to launch work on
+ */
+template <typename Type>
+void colWeightedMean(
+  Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream)
+{
+  // First pass: reduce the weights into mu[0] and fetch that sum to the host.
+  Type weight_sum = 0;
+  raft::linalg::stridedReduction(mu, weights, 1, N, (Type)0, stream, false);
+  raft::update_host(&weight_sum, mu, 1, stream);
+  // NOTE(review): weight_sum is captured by value below; presumes the copy has landed - confirm
+  raft::linalg::stridedReduction(
+    mu,
+    data,
+    D,
+    N,
+    (Type)0,
+    stream,
+    false,
+    [weights] __device__(Type val, int idx) { return val * weights[idx]; },
+    [] __device__(Type lhs, Type rhs) { return lhs + rhs; },
+    [weight_sum] __device__(Type acc) { return acc / weight_sum; });
+}
+};  // end namespace detail
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/histogram.hpp b/cpp/include/raft/stats/histogram.hpp
new file mode 100644
index 0000000000..30e982115a
--- /dev/null
+++ b/cpp/include/raft/stats/histogram.hpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/stats/detail/histogram.cuh>
+
+// This file is a shameless amalgamation of independent works done by
+// Lars Nyland and Andy Adinets
+
+///@todo: add cub's histogram as another option
+
+namespace raft {
+namespace stats {
+/**
+ * @brief Perform histogram on the input data. It chooses the right load size
+ * based on the input data vector length. It also supports large-bin cases
+ * using a specialized smem-based hashing technique.
+ * @tparam DataT input data type
+ * @tparam IdxT data type used to compute indices
+ * @tparam BinnerOp takes the input data and computes its bin index
+ * @param type histogram implementation type to choose
+ * @param bins the output bins (length = ncols * nbins)
+ * @param nbins number of bins
+ * @param data input data (length = ncols * nrows)
+ * @param nrows data array length in each column (or batch)
+ * @param ncols number of columns (or batch size)
+ * @param stream cuda stream
+ * @param binner the operation that computes the bin index of the input data
+ *
+ * @note signature of BinnerOp is `int func(DataT, IdxT);`
+ */
+template <typename DataT, typename IdxT = int, typename BinnerOp = IdentityBinner<DataT, IdxT>>
+void histogram(HistType type,
+               int* bins,
+               IdxT nbins,
+               const DataT* data,
+               IdxT nrows,
+               IdxT ncols,
+               cudaStream_t stream,
+               BinnerOp binner = IdentityBinner<DataT, IdxT>())
+{
+  detail::histogram<DataT, IdxT, BinnerOp>(type, bins, nbins, data, nrows, ncols, stream, binner);
+}
+
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/minmax.hpp b/cpp/include/raft/stats/minmax.hpp
new file mode 100644
index 0000000000..966287bb41
--- /dev/null
+++ b/cpp/include/raft/stats/minmax.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/stats/detail/minmax.cuh>
+
+#include <limits>
+
+namespace raft {
+namespace stats {
+
+/**
+ * @brief Computes min/max across every column of the input matrix, as well as
+ * optionally allow to subsample based on the given row/col ID mapping vectors
+ *
+ * @tparam T the data type
+ * @tparam TPB number of threads per block
+ * @param data input data
+ * @param rowids actual row ID mappings. It is of length nrows. If you want to
+ * skip this index lookup entirely, pass nullptr
+ * @param colids actual col ID mappings. It is of length ncols. If you want to
+ * skip this index lookup entirely, pass nullptr
+ * @param nrows number of rows of data to be worked upon. The actual rows of the
+ * input "data" can be bigger than this!
+ * @param ncols number of cols of data to be worked upon. The actual cols of the
+ * input "data" can be bigger than this!
+ * @param row_stride stride (in number of elements) between 2 adjacent columns
+ * @param globalmin final col-wise global minimum (size = ncols)
+ * @param globalmax final col-wise global maximum (size = ncols)
+ * @param sampledcols output sampled data. Pass nullptr if you don't need this
+ * @param stream cuda stream
+ * @note This method makes the following assumptions:
+ * 1. input and output matrices are assumed to be col-major
+ * 2. the implementation walks the columns in batches sized from the shared
+ *    memory available per block
+ */
+template <typename T, int TPB = 512>
+void minmax(const T* data,
+            const unsigned* rowids,
+            const unsigned* colids,
+            int nrows,
+            int ncols,
+            int row_stride,
+            T* globalmin,
+            T* globalmax,
+            T* sampledcols,
+            cudaStream_t stream)
+{
+  detail::minmax<T, TPB>(
+    data, rowids, colids, nrows, ncols, row_stride, globalmin, globalmax, sampledcols, stream);
+}
+
+};  // namespace stats
+};  // namespace raft
diff --git a/cpp/include/raft/stats/weighted_mean.hpp b/cpp/include/raft/stats/weighted_mean.hpp
new file mode 100644
index 0000000000..ad90142a08
--- /dev/null
+++ b/cpp/include/raft/stats/weighted_mean.hpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/stats/detail/weighted_mean.cuh>
+
+namespace raft {
+namespace stats {
+
+/**
+ * @brief Compute the row-wise weighted mean of the input matrix
+ *
+ * @tparam Type the data type
+ * @param mu the output mean vector
+ * @param data the input matrix (assumed to be row-major)
+ * @param weights the weights applied to the data (the implementation sums D of them)
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param stream cuda stream to launch work on
+ */
+template <typename Type>
+void rowWeightedMean(
+  Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream)
+{
+  detail::rowWeightedMean(mu, data, weights, D, N, stream);
+}
+
+/**
+ * @brief Compute the column-wise weighted mean of the input matrix
+ *
+ * @tparam Type the data type
+ * @param mu the output mean vector
+ * @param data the input matrix (assumed to be column-major)
+ * @param weights the weights applied to the data (the implementation sums N of them)
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param stream cuda stream to launch work on
+ */
+template <typename Type>
+void colWeightedMean(
+  Type* mu, const Type* data, const Type* weights, int D, int N, cudaStream_t stream)
+{
+  detail::colWeightedMean(mu, data, weights, D, N, stream);
+}
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 2ace88b498..cd08de629c 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -16,6 +16,7 @@
 
 # keep the files in alphabetical order!
 add_executable(test_raft
+    test/common/seive.cu
     test/cudart_utils.cpp
     test/cluster_solvers.cu
     test/distance/dist_adj.cu
@@ -103,11 +104,15 @@ add_executable(test_raft
     test/spatial/faiss_mr.cu
     test/spatial/selection.cu
     test/spectral_matrix.cu
+    test/stats/cov.cu
+    test/stats/histogram.cu
     test/stats/mean.cu
     test/stats/meanvar.cu
     test/stats/mean_center.cu
+    test/stats/minmax.cu
     test/stats/stddev.cu
     test/stats/sum.cu
+    test/stats/weighted_mean.cu
     test/test.cpp
 )
 
diff --git a/cpp/test/common/seive.cu b/cpp/test/common/seive.cu
new file mode 100644
index 0000000000..ca46397b19
--- /dev/null
+++ b/cpp/test/common/seive.cu
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/common/seive.cuh>
+
+namespace raft {
+namespace common {
+TEST(Seive, Test)
+{
+  Seive s1(32);  // presumably the argument is the sieve's upper bound - confirm
+  ASSERT_TRUE(s1.isPrime(17));
+  ASSERT_FALSE(s1.isPrime(28));
+  // second instance, constructed with a much larger argument
+  Seive s2(1024 * 1024);
+  ASSERT_TRUE(s2.isPrime(107));
+  ASSERT_FALSE(s2.isPrime(111));  // 111 = 3 * 37
+  ASSERT_TRUE(s2.isPrime(6047));
+}
+
+}  // end namespace common
+}  // end namespace raft
diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu
new file mode 100644
index 0000000000..92f3101d75
--- /dev/null
+++ b/cpp/test/stats/cov.cu
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.hpp>
+#include <raft/stats/cov.cuh>
+#include <raft/stats/mean.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft {
+namespace stats {
+
+template <typename T>
+struct CovInputs {
+  T tolerance, mean, var;  // `var` goes to Rng::normal; the tests expect var * var
+  int rows, cols;
+  bool sample, rowMajor, stable;  // forwarded to raft::stats::mean / cov in SetUp()
+  unsigned long long int seed;    // seed of the raft::random::Rng that fills the input
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const CovInputs<T>& dims)
+{
+  return os;  // nothing is written to the stream; `dims` is unused
+}
+
+template <typename T>
+class CovTest : public ::testing::TestWithParam<CovInputs<T>> {
+ protected:
+  CovTest()
+    : data(0, stream),
+      mean_act(0, stream),
+      cov_act(0, stream),
+      cov_cm(0, stream),
+      cov_cm_ref(0, stream)
+  {
+  }
+
+  void SetUp() override
+  {
+    // NOTE: these two locals shadow the `handle` / `stream` members for the rest of SetUp().
+    raft::handle_t handle;
+    cudaStream_t stream = handle.get_stream();
+    params = ::testing::TestWithParam<CovInputs<T>>::GetParam();
+    params.tolerance *= 2;
+    raft::random::Rng r(params.seed);
+    int rows = params.rows, cols = params.cols;
+    auto len = rows * cols;
+    T var    = params.var;
+    data.resize(len, stream);
+    mean_act.resize(cols, stream);
+    cov_act.resize(cols * cols, stream);
+    // random input, then its mean and covariance (checked by the CovTestF / CovTestD cases)
+    r.normal(data.data(), len, params.mean, var, stream);
+    raft::stats::mean(
+      mean_act.data(), data.data(), cols, rows, params.sample, params.rowMajor, stream);
+    cov(handle,
+        cov_act.data(),
+        data.data(),
+        mean_act.data(),
+        cols,
+        rows,
+        params.sample,
+        params.rowMajor,
+        params.stable,
+        stream);
+    // fixed 6-element input (2 columns, 3 rows, rowMajor = false) with a hard-coded reference
+    T data_h[6]       = {1.0, 2.0, 5.0, 4.0, 2.0, 1.0};
+    T cov_cm_ref_h[4] = {4.3333, -2.8333, -2.8333, 2.333};
+
+    cov_cm.resize(4, stream);
+    cov_cm_ref.resize(4, stream);
+    rmm::device_uvector<T> data_cm(6, stream);
+    rmm::device_uvector<T> mean_cm(2, stream);
+
+    raft::update_device(data_cm.data(), data_h, 6, stream);
+    raft::update_device(cov_cm_ref.data(), cov_cm_ref_h, 4, stream);
+
+    raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, true, false, stream);
+    cov(handle, cov_cm.data(), data_cm.data(), mean_cm.data(), 2, 3, true, false, true, stream);
+  }
+
+ protected:
+  CovInputs<T> params;
+  cudaStream_t stream = 0;  // declared before the buffers: the constructor reads it for them
+  rmm::device_uvector<T> data, mean_act, cov_act, cov_cm, cov_cm_ref;
+  cublasHandle_t handle;
+};
+
+///@todo: add stable=false after it has been implemented
+const std::vector<CovInputs<float>> inputsf = {  // entries follow CovInputs member order
+  {0.03f, 1.f, 2.f, 32 * 1024, 32, true, false, true, 1234ULL},
+  {0.03f, 1.f, 2.f, 32 * 1024, 64, true, false, true, 1234ULL},
+  {0.03f, 1.f, 2.f, 32 * 1024, 128, true, false, true, 1234ULL},
+  {0.03f, 1.f, 2.f, 32 * 1024, 256, true, false, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 32, false, false, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 64, false, false, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 128, false, false, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 256, false, false, true, 1234ULL},
+  {0.03f, 1.f, 2.f, 32 * 1024, 32, true, true, true, 1234ULL},
+  {0.03f, 1.f, 2.f, 32 * 1024, 64, true, true, true, 1234ULL},
+  {0.03f, 1.f, 2.f, 32 * 1024, 128, true, true, true, 1234ULL},
+  {0.03f, 1.f, 2.f, 32 * 1024, 256, true, true, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 32, false, true, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 64, false, true, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 128, false, true, true, 1234ULL},
+  {0.03f, -1.f, 2.f, 32 * 1024, 256, false, true, true, 1234ULL}};
+
+const std::vector<CovInputs<double>> inputsd = {  // entries follow CovInputs member order
+  {0.03, 1.0, 2.0, 32 * 1024, 32, true, false, true, 1234ULL},
+  {0.03, 1.0, 2.0, 32 * 1024, 64, true, false, true, 1234ULL},
+  {0.03, 1.0, 2.0, 32 * 1024, 128, true, false, true, 1234ULL},
+  {0.03, 1.0, 2.0, 32 * 1024, 256, true, false, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 32, false, false, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 64, false, false, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 128, false, false, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 256, false, false, true, 1234ULL},
+  {0.03, 1.0, 2.0, 32 * 1024, 32, true, true, true, 1234ULL},
+  {0.03, 1.0, 2.0, 32 * 1024, 64, true, true, true, 1234ULL},
+  {0.03, 1.0, 2.0, 32 * 1024, 128, true, true, true, 1234ULL},
+  {0.03, 1.0, 2.0, 32 * 1024, 256, true, true, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 32, false, true, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 64, false, true, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 128, false, true, true, 1234ULL},
+  {0.03, -1.0, 2.0, 32 * 1024, 256, false, true, true, 1234ULL}};
+
+using CovTestF = CovTest<float>;  // single-precision fixture
+TEST_P(CovTestF, Result)
+{
+  // diagonalMatch checks cov_act (cols x cols) against var * var
+  const float expected_diag = params.var * params.var;
+  ASSERT_TRUE(raft::diagonalMatch(
+    expected_diag, cov_act.data(), params.cols, params.cols,
+    raft::CompareApprox<float>(params.tolerance)));
+}
+
+using CovTestD = CovTest<double>;  // double-precision fixture
+TEST_P(CovTestD, Result)
+{
+  // diagonalMatch checks cov_act (cols x cols) against var * var
+  const double expected_diag = params.var * params.var;
+  ASSERT_TRUE(raft::diagonalMatch(
+    expected_diag, cov_act.data(), params.cols, params.cols,
+    raft::CompareApprox<double>(params.tolerance)));
+}
+
+using CovTestSmallF = CovTest<float>;  // compares cov_cm with the hard-coded 2x2 reference
+TEST_P(CovTestSmallF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    cov_cm_ref.data(), cov_cm.data(), 2, 2, raft::CompareApprox<float>(params.tolerance)));
+}
+
+using CovTestSmallD = CovTest<double>;  // compares cov_cm with the hard-coded 2x2 reference
+TEST_P(CovTestSmallD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    cov_cm_ref.data(), cov_cm.data(), 2, 2, raft::CompareApprox<double>(params.tolerance)));
+}
+
+INSTANTIATE_TEST_CASE_P(CovTests, CovTestF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_CASE_P(CovTests, CovTestD, ::testing::ValuesIn(inputsd));
+// The "Small" fixtures are instantiated over the same parameter lists as the cases above.
+INSTANTIATE_TEST_CASE_P(CovTests, CovTestSmallF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_CASE_P(CovTests, CovTestSmallD, ::testing::ValuesIn(inputsd));
+
+}  // namespace stats
+}  // namespace raft
diff --git a/cpp/test/stats/histogram.cu b/cpp/test/stats/histogram.cu
new file mode 100644
index 0000000000..60dc5fb909
--- /dev/null
+++ b/cpp/test/stats/histogram.cu
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/interruptible.hpp>
+#include <raft/random/rng.hpp>
+#include <raft/stats/histogram.cuh>
+
+namespace raft {
+namespace stats {
+
+// Reference histogram, one column per blockIdx.y; also clamps `in` in place to [0, nbins - 1]!
+__global__ void naiveHistKernel(int* bins, int nbins, int* in, int nrows)
+{
+  int tid        = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride     = blockDim.x * gridDim.x;  // total number of threads along x
+  auto offset    = blockIdx.y * nrows;  // start of this column in `in`
+  auto binOffset = blockIdx.y * nbins;  // start of this column in `bins`
+  for (; tid < nrows; tid += stride) {
+    int id = in[offset + tid];
+    if (id < 0)
+      id = 0;
+    else if (id >= nbins)
+      id = nbins - 1;
+    in[offset + tid] = id;  // write the clamped id back to the input
+    raft::myAtomicAdd(bins + binOffset + id, 1);
+  }
+}
+
+void naiveHist(int* bins, int nbins, int* in, int nrows, int ncols, cudaStream_t stream)
+{  // Launches naiveHistKernel on `stream`; counts are added to `bins`, which is not zeroed here.
+  const int TPB = 128;  // threads per block
+  int nblksx    = raft::ceildiv(nrows, TPB);
+  dim3 blks(nblksx, ncols);  // x covers the rows, y indexes the columns
+  naiveHistKernel<<<blks, TPB, 0, stream>>>(bins, nbins, in, nrows);
+  RAFT_CUDA_TRY(cudaGetLastError());  // report launch errors
+}
+
+struct HistInputs {  // parameters of one HistTest case
+  int nrows, ncols, nbins;
+  bool isNormal;  // true: Rng::normalInt fills the input, false: Rng::uniformInt
+  HistType type;  // passed as the first argument of histogram()
+  int start, end;  // passed as the 3rd/4th arguments of normalInt/uniformInt
+  unsigned long long int seed;  // seed for raft::random::Rng
+};
+
+class HistTest : public ::testing::TestWithParam<HistInputs> {  // naiveHist vs histogram()
+ protected:
+  HistTest() : in(0, stream), bins(0, stream), ref_bins(0, stream) {}  // sized in SetUp
+
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<HistInputs>::GetParam();
+    raft::random::Rng r(params.seed);
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));  // replaces the initial 0 handle
+    int len = params.nrows * params.ncols;
+    in.resize(len, stream);
+    if (params.isNormal) {
+      r.normalInt(in.data(), len, params.start, params.end, stream);
+    } else {
+      r.uniformInt(in.data(), len, params.start, params.end, stream);
+    }
+    bins.resize(params.nbins * params.ncols, stream);  // no memset; presumably histogram() zeroes
+    ref_bins.resize(params.nbins * params.ncols, stream);
+    RAFT_CUDA_TRY(
+      cudaMemsetAsync(ref_bins.data(), 0, sizeof(int) * params.nbins * params.ncols, stream));
+    naiveHist(ref_bins.data(), params.nbins, in.data(), params.nrows, params.ncols, stream);
+    histogram<int>(  // enqueued after naiveHist, which clamps `in` in place
+      params.type, bins.data(), params.nbins, in.data(), params.nrows, params.ncols, stream);
+    raft::interruptible::synchronize(stream);  // wait for both results before the test body
+  }
+
+  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
+
+ protected:
+  cudaStream_t stream = 0;  // declared before the vectors: the constructor passes it to them
+  HistInputs params;
+  rmm::device_uvector<int> in, bins, ref_bins;
+};
+
+static const int oneK                = 1024;
+static const int oneM                = oneK * oneK;
+const std::vector<HistInputs> inputs = {
+  {oneM, 1, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL},
+  {oneM, 1, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL},
+  {oneM + 1, 1, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL},
+  {oneM + 2, 1, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL},
+  {oneM, 21, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL},
+  {oneM + 1, 21, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneM, false, HistTypeGmem, 0, 2 * oneM, 1234ULL},
+  {oneM + 2, 21, 2 * oneM, true, HistTypeGmem, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmem, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmem, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmemMatchAny, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmemMatchAny, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits16, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits16, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits8, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits8, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits4, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits4, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits2, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits2, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmemBits1, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmemBits1, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL},
+  {oneM, 1, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL},
+  {oneM + 1, 1, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL},
+  {oneM + 2, 1, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM, 1, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL},
+  {oneM, 21, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL},
+  {oneM + 1, 21, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneM, false, HistTypeSmemHash, 0, 2 * oneM, 1234ULL},
+  {oneM + 2, 21, 2 * oneM, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeSmemHash, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeSmemHash, 1000, 50, 1234ULL},
+
+  {oneM, 1, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL},
+  {oneM, 1, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL},
+  {oneM + 1, 1, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL},
+  {oneM + 2, 1, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM, 1, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL},
+  {oneM, 1, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 1, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 1, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL},
+  {oneM, 21, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL},
+  {oneM + 1, 21, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneM, false, HistTypeAuto, 0, 2 * oneM, 1234ULL},
+  {oneM + 2, 21, 2 * oneM, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM, 21, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL},
+  {oneM, 21, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL},
+  {oneM + 1, 21, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, false, HistTypeAuto, 0, 2 * oneK, 1234ULL},
+  {oneM + 2, 21, 2 * oneK, true, HistTypeAuto, 1000, 50, 1234ULL},
+};
+TEST_P(HistTest, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    ref_bins.data(), bins.data(), params.nbins * params.ncols, raft::Compare<int>()));
+}
+INSTANTIATE_TEST_CASE_P(HistTests, HistTest, ::testing::ValuesIn(inputs));
+
+}  // end namespace stats
+}  // end namespace raft
diff --git a/cpp/test/stats/minmax.cu b/cpp/test/stats/minmax.cu
new file mode 100644
index 0000000000..e505f3ed00
--- /dev/null
+++ b/cpp/test/stats/minmax.cu
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "test_utils.h"
+#include <gtest/gtest.h>
+#include <limits>
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.hpp>
+#include <raft/stats/minmax.cuh>
+#include <stdio.h>
+#include <stdlib.h>
+
+namespace raft {
+namespace stats {
+
+///@todo: need to add tests for verifying the column subsampling feature
+
+template <typename T>
+struct MinMaxInputs {  // parameters of one MinMaxTest case
+  T tolerance;  // passed to raft::CompareApprox in the TEST_P bodies
+  int rows, cols;  // the test data holds rows * cols elements
+  unsigned long long int seed;  // seed for raft::random::Rng
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const MinMaxInputs<T>& dims)
+{
+  return os;  // writes nothing; `dims` is unused
+}
+
+template <typename T>
+__global__ void naiveMinMaxInitKernel(int ncols, T* globalmin, T* globalmax, T init_val)
+{
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;  // one thread per column
+  if (tid >= ncols) return;
+  globalmin[tid] = init_val;  // running minimum starts at init_val
+  globalmax[tid] = -init_val;  // running maximum starts at -init_val
+}
+
+template <typename T>
+__global__ void naiveMinMaxKernel(const T* data, int nrows, int ncols, T* globalmin, T* globalmax)
+{
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;  // one thread per element
+  int col = tid / nrows;  // each run of nrows consecutive elements is one column
+  if (col < ncols) {
+    T val = data[tid];
+    if (!isnan(val)) {  // NaN entries do not contribute to either result
+      raft::myAtomicMin(&globalmin[col], val);
+      raft::myAtomicMax(&globalmax[col], val);
+    }
+  }
+}
+
+template <typename T>
+void naiveMinMax(
+  const T* data, int nrows, int ncols, T* globalmin, T* globalmax, cudaStream_t stream)
+{  // Reference per-column min/max; both kernels are enqueued on `stream`.
+  const int TPB = 128;  // threads per block
+  int nblks     = raft::ceildiv(ncols, TPB);  // init kernel: one thread per column
+  T init_val    = std::numeric_limits<T>::max();  // min starts at max(), max at -max()
+  naiveMinMaxInitKernel<<<nblks, TPB, 0, stream>>>(ncols, globalmin, globalmax, init_val);
+  RAFT_CUDA_TRY(cudaGetLastError());
+  nblks = raft::ceildiv(nrows * ncols, TPB);  // main kernel: one thread per element
+  naiveMinMaxKernel<<<nblks, TPB, 0, stream>>>(data, nrows, ncols, globalmin, globalmax);
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+template <typename T>
+__global__ void nanKernel(T* data, const bool* mask, int len, T nan)
+{
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;  // one thread per element
+  if (tid >= len) return;
+  if (!mask[tid]) data[tid] = nan;  // overwrite entries whose mask is false
+}
+
+// Fixture comparing minmax() with the naive reference on normal data with NaNs injected.
+template <typename T>
+class MinMaxTest : public ::testing::TestWithParam<MinMaxInputs<T>> {
+ protected:
+  MinMaxTest() : minmax_act(0, stream), minmax_ref(0, stream) {}
+
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<MinMaxInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int len = params.rows * params.cols;
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+    rmm::device_uvector<T> data(len, stream);
+    rmm::device_uvector<bool> mask(len, stream);
+    minmax_act.resize(2 * params.cols, stream);  // [0, cols): mins, [cols, 2 * cols): maxes
+    minmax_ref.resize(2 * params.cols, stream);  // same layout as minmax_act
+    r.normal(data.data(), len, (T)0.0, (T)1.0, stream);
+    T nan_prob = 0.01;
+    r.bernoulli(mask.data(), len, nan_prob, stream);
+    const int TPB = 256;
+    nanKernel<<<raft::ceildiv(len, TPB), TPB, 0, stream>>>(
+      data.data(), mask.data(), len, std::numeric_limits<T>::quiet_NaN());
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+    naiveMinMax(data.data(),
+                params.rows,
+                params.cols,
+                minmax_ref.data(),
+                minmax_ref.data() + params.cols,
+                stream);
+    minmax<T, 512>(data.data(),
+                   nullptr,
+                   nullptr,
+                   params.rows,
+                   params.cols,
+                   params.rows,
+                   minmax_act.data(),
+                   minmax_act.data() + params.cols,
+                   nullptr,
+                   stream);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));  // results ready before the test body
+  }
+
+  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
+
+  cudaStream_t stream = 0;  // declared first: the constructor passes it to the vectors
+  MinMaxInputs<T> params;
+  rmm::device_uvector<T> minmax_act, minmax_ref;
+};
+
+const std::vector<MinMaxInputs<float>> inputsf = {{0.00001f, 1024, 32, 1234ULL},
+                                                  {0.00001f, 1024, 64, 1234ULL},
+                                                  {0.00001f, 1024, 128, 1234ULL},
+                                                  {0.00001f, 1024, 256, 1234ULL},
+                                                  {0.00001f, 1024, 512, 1234ULL},
+                                                  {0.00001f, 1024, 1024, 1234ULL},
+                                                  {0.00001f, 4096, 32, 1234ULL},
+                                                  {0.00001f, 4096, 64, 1234ULL},
+                                                  {0.00001f, 4096, 128, 1234ULL},
+                                                  {0.00001f, 4096, 256, 1234ULL},
+                                                  {0.00001f, 4096, 512, 1234ULL},
+                                                  {0.00001f, 4096, 1024, 1234ULL},
+                                                  {0.00001f, 8192, 32, 1234ULL},
+                                                  {0.00001f, 8192, 64, 1234ULL},
+                                                  {0.00001f, 8192, 128, 1234ULL},
+                                                  {0.00001f, 8192, 256, 1234ULL},
+                                                  {0.00001f, 8192, 512, 1234ULL},
+                                                  {0.00001f, 8192, 1024, 1234ULL},
+                                                  {0.00001f, 1024, 8192, 1234ULL}};
+
+const std::vector<MinMaxInputs<double>> inputsd = {{0.0000001, 1024, 32, 1234ULL},
+                                                   {0.0000001, 1024, 64, 1234ULL},
+                                                   {0.0000001, 1024, 128, 1234ULL},
+                                                   {0.0000001, 1024, 256, 1234ULL},
+                                                   {0.0000001, 1024, 512, 1234ULL},
+                                                   {0.0000001, 1024, 1024, 1234ULL},
+                                                   {0.0000001, 4096, 32, 1234ULL},
+                                                   {0.0000001, 4096, 64, 1234ULL},
+                                                   {0.0000001, 4096, 128, 1234ULL},
+                                                   {0.0000001, 4096, 256, 1234ULL},
+                                                   {0.0000001, 4096, 512, 1234ULL},
+                                                   {0.0000001, 4096, 1024, 1234ULL},
+                                                   {0.0000001, 8192, 32, 1234ULL},
+                                                   {0.0000001, 8192, 64, 1234ULL},
+                                                   {0.0000001, 8192, 128, 1234ULL},
+                                                   {0.0000001, 8192, 256, 1234ULL},
+                                                   {0.0000001, 8192, 512, 1234ULL},
+                                                   {0.0000001, 8192, 1024, 1234ULL},
+                                                   {0.0000001, 1024, 8192, 1234ULL}};
+
+typedef MinMaxTest<float> MinMaxTestF;
+TEST_P(MinMaxTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(minmax_ref.data(),
+                                minmax_act.data(),
+                                2 * params.cols,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+
+typedef MinMaxTest<double> MinMaxTestD;
+TEST_P(MinMaxTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(minmax_ref.data(),
+                                minmax_act.data(),
+                                2 * params.cols,
+                                raft::CompareApprox<double>(params.tolerance)));
+}
+
+INSTANTIATE_TEST_CASE_P(MinMaxTests, MinMaxTestF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_CASE_P(MinMaxTests, MinMaxTestD, ::testing::ValuesIn(inputsd));
+
+}  // end namespace stats
+}  // end namespace raft
diff --git a/cpp/test/stats/weighted_mean.cu b/cpp/test/stats/weighted_mean.cu
new file mode 100644
index 0000000000..b3502bc5bc
--- /dev/null
+++ b/cpp/test/stats/weighted_mean.cu
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/random/rng.hpp>
+#include <raft/stats/weighted_mean.cuh>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+namespace raft {
+namespace stats {
+
+template <typename T>
+struct WeightedMeanInputs {  // parameters of one weighted-mean test case
+  T tolerance;  // multiplied by N (row test) or M (column test) in SetUp
+  int M, N;  // used as the row and column counts of the input matrix
+  unsigned long long int seed;  // seed for raft::random::Rng
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const WeightedMeanInputs<T>& I)
+{  // streams the four fields in declaration order, followed by std::endl
+  return os << "{ " << I.tolerance << ", " << I.M << ", " << I.N << ", " << I.seed << "}"
+            << std::endl;
+}
+
+///// weighted row-wise mean test and support functions
+template <typename T>
+void naiveRowWeightedMean(T* R, T* D, T* W, int M, int N, bool rowMajor)
+{  // host reference: R[j] = sum_i W[i] * D(j, i) / sum_i W[i] for an M x N matrix D
+  int istr = rowMajor ? 1 : M;  // stride between consecutive columns
+  int jstr = rowMajor ? N : 1;  // stride between consecutive rows
+
+  // sum the weights
+  T WS = 0;
+  for (int i = 0; i < N; i++)
+    WS += W[i];
+
+  for (int j = 0; j < M; j++) {
+    R[j] = (T)0;
+    for (int i = 0; i < N; i++) {
+      // each term is divided by the weight sum before being accumulated
+      R[j] += (W[i] * D[i * istr + j * jstr]) / WS;
+    }
+  }
+}
+
+// Fixture checking rowWeightedMean() against the host reference naiveRowWeightedMean().
+template <typename T>
+class RowWeightedMeanTest : public ::testing::TestWithParam<WeightedMeanInputs<T>> {
+ protected:
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<WeightedMeanInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int rows = params.M, cols = params.N, len = rows * cols;
+    cudaStream_t stream = 0;
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+    // device-side data
+    din.resize(len);
+    dweights.resize(cols);
+    dexp.resize(rows);
+    dact.resize(rows);
+
+    // create random matrix and weights; wait so the host copies below see the final values
+    r.uniform(din.data().get(), len, T(-1.0), T(1.0), stream);
+    r.uniform(dweights.data().get(), cols, T(-1.0), T(1.0), stream);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+
+    // host-side data (kept in the fixture members instead of shadowing them with locals)
+    hin      = din;
+    hweights = dweights;
+    thrust::host_vector<T> hexp(rows);
+
+    // compute naive result & copy to GPU
+    naiveRowWeightedMean(hexp.data(), hin.data(), hweights.data(), rows, cols, true);
+    dexp = hexp;
+
+    // compute ml-prims result and wait for it before the stream is destroyed
+    rowWeightedMean(dact.data().get(), din.data().get(), dweights.data().get(), cols, rows, stream);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+    RAFT_CUDA_TRY(cudaStreamDestroy(stream));
+
+    // adjust tolerance to account for round-off accumulation
+    params.tolerance *= params.N;
+  }
+
+  WeightedMeanInputs<T> params;
+  thrust::host_vector<T> hin, hweights;
+  thrust::device_vector<T> din, dweights, dexp, dact;
+};
+
+///// weighted column-wise mean test and support functions
+template <typename T>
+void naiveColWeightedMean(T* R, T* D, T* W, int M, int N, bool rowMajor)
+{  // host reference: R[i] = sum_j W[j] * D(j, i) / sum_j W[j] for an M x N matrix D
+  int istr = rowMajor ? 1 : M;  // stride between consecutive columns
+  int jstr = rowMajor ? N : 1;  // stride between consecutive rows
+
+  // sum the weights
+  T WS = 0;
+  for (int j = 0; j < M; j++)
+    WS += W[j];
+
+  for (int i = 0; i < N; i++) {
+    R[i] = (T)0;
+    for (int j = 0; j < M; j++) {
+      // each term is divided by the weight sum before being accumulated
+      R[i] += (W[j] * D[i * istr + j * jstr]) / WS;
+    }
+  }
+}
+
+// Fixture checking colWeightedMean() against the host reference naiveColWeightedMean().
+template <typename T>
+class ColWeightedMeanTest : public ::testing::TestWithParam<WeightedMeanInputs<T>> {
+ protected:
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<WeightedMeanInputs<T>>::GetParam();
+    raft::random::Rng r(params.seed);
+    int rows = params.M, cols = params.N, len = rows * cols;
+    cudaStream_t stream = 0;
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+    // device-side data
+    din.resize(len);
+    dweights.resize(rows);
+    dexp.resize(cols);
+    dact.resize(cols);
+
+    // create random matrix and weights; wait so the host copies below see the final values
+    r.uniform(din.data().get(), len, T(-1.0), T(1.0), stream);
+    r.uniform(dweights.data().get(), rows, T(-1.0), T(1.0), stream);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+
+    // host-side data (kept in the fixture members instead of shadowing them with locals)
+    hin      = din;
+    hweights = dweights;
+    thrust::host_vector<T> hexp(cols);
+
+    // compute naive result & copy to GPU
+    naiveColWeightedMean(hexp.data(), hin.data(), hweights.data(), rows, cols, true);
+    dexp = hexp;
+
+    // compute ml-prims result and wait for it before the stream is destroyed
+    colWeightedMean(dact.data().get(), din.data().get(), dweights.data().get(), cols, rows, stream);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+    RAFT_CUDA_TRY(cudaStreamDestroy(stream));
+
+    // adjust tolerance to account for round-off accumulation
+    params.tolerance *= params.M;
+  }
+
+  WeightedMeanInputs<T> params;
+  thrust::host_vector<T> hin, hweights;
+  thrust::device_vector<T> din, dweights, dexp, dact;
+};
+
+////// Parameter sets and test instantiation
+static const float tolF  = 128 * std::numeric_limits<float>::epsilon();
+static const double tolD = 256 * std::numeric_limits<double>::epsilon();
+
+const std::vector<WeightedMeanInputs<float>> inputsf = {{tolF, 4, 4, 1234},
+                                                        {tolF, 1024, 32, 1234},
+                                                        {tolF, 1024, 64, 1234},
+                                                        {tolF, 1024, 128, 1234},
+                                                        {tolF, 1024, 256, 1234},
+                                                        {tolF, 1024, 32, 1234},
+                                                        {tolF, 1024, 64, 1234},
+                                                        {tolF, 1024, 128, 1234},
+                                                        {tolF, 1024, 256, 1234}};
+
+const std::vector<WeightedMeanInputs<double>> inputsd = {{tolD, 4, 4, 1234},
+                                                         {tolD, 1024, 32, 1234},
+                                                         {tolD, 1024, 64, 1234},
+                                                         {tolD, 1024, 128, 1234},
+                                                         {tolD, 1024, 256, 1234},
+                                                         {tolD, 1024, 32, 1234},
+                                                         {tolD, 1024, 64, 1234},
+                                                         {tolD, 1024, 128, 1234},
+                                                         {tolD, 1024, 256, 1234}};
+
+using RowWeightedMeanTestF = RowWeightedMeanTest<float>;
+TEST_P(RowWeightedMeanTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    dexp.data().get(), dact.data().get(), params.M, raft::CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(RowWeightedMeanTest, RowWeightedMeanTestF, ::testing::ValuesIn(inputsf));
+
+using RowWeightedMeanTestD = RowWeightedMeanTest<double>;
+TEST_P(RowWeightedMeanTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    dexp.data().get(), dact.data().get(), params.M, raft::CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(RowWeightedMeanTest, RowWeightedMeanTestD, ::testing::ValuesIn(inputsd));
+
+using ColWeightedMeanTestF = ColWeightedMeanTest<float>;
+TEST_P(ColWeightedMeanTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    dexp.data().get(), dact.data().get(), params.N, raft::CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(ColWeightedMeanTest, ColWeightedMeanTestF, ::testing::ValuesIn(inputsf));
+
+using ColWeightedMeanTestD = ColWeightedMeanTest<double>;
+TEST_P(ColWeightedMeanTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    dexp.data().get(), dact.data().get(), params.N, raft::CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(ColWeightedMeanTest, ColWeightedMeanTestD, ::testing::ValuesIn(inputsd));
+
+};  // end namespace stats
+};  // end namespace raft

From 3297a3dc5c6958e9f1141b1adc03ce31e9b982a7 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 10 Feb 2022 17:07:34 -0500
Subject: [PATCH 14/24] Re-routing includes

---
 cpp/test/stats/cov.cu           | 2 +-
 cpp/test/stats/histogram.cu     | 2 +-
 cpp/test/stats/minmax.cu        | 2 +-
 cpp/test/stats/weighted_mean.cu | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu
index 92f3101d75..02e01deec3 100644
--- a/cpp/test/stats/cov.cu
+++ b/cpp/test/stats/cov.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "test_utils.h"
+#include "../test_utils.h"
 #include <gtest/gtest.h>
 #include <raft/cudart_utils.h>
 #include <raft/random/rng.hpp>
diff --git a/cpp/test/stats/histogram.cu b/cpp/test/stats/histogram.cu
index 60dc5fb909..b89e2ab208 100644
--- a/cpp/test/stats/histogram.cu
+++ b/cpp/test/stats/histogram.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "test_utils.h"
+#include "../test_utils.h"
 #include <gtest/gtest.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
diff --git a/cpp/test/stats/minmax.cu b/cpp/test/stats/minmax.cu
index e505f3ed00..777ac800e9 100644
--- a/cpp/test/stats/minmax.cu
+++ b/cpp/test/stats/minmax.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "test_utils.h"
+#include "../test_utils.h"
 #include <gtest/gtest.h>
 #include <limits>
 #include <raft/cuda_utils.cuh>
diff --git a/cpp/test/stats/weighted_mean.cu b/cpp/test/stats/weighted_mean.cu
index b3502bc5bc..df77a19d73 100644
--- a/cpp/test/stats/weighted_mean.cu
+++ b/cpp/test/stats/weighted_mean.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "test_utils.h"
+#include "../test_utils.h"
 #include <gtest/gtest.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/random/rng.hpp>

From 510188d56531dc753def075c10cde9fc9565002c Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 10 Feb 2022 17:28:47 -0500
Subject: [PATCH 15/24] Moving epsilon neighborhood. Need data generators for
 tests

---
 cpp/include/raft/device_utils.cuh             | 108 ++++++++
 .../knn/detail/epsilon_neighborhood.cuh       | 233 ++++++++++++++++++
 .../raft/spatial/knn/epsilon_neighborhood.hpp |  56 +++++
 cpp/test/spatial/epsilon_neighborhood.cu      | 140 +++++++++++
 4 files changed, 537 insertions(+)
 create mode 100644 cpp/include/raft/device_utils.cuh
 create mode 100644 cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh
 create mode 100644 cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
 create mode 100644 cpp/test/spatial/epsilon_neighborhood.cu

diff --git a/cpp/include/raft/device_utils.cuh b/cpp/include/raft/device_utils.cuh
new file mode 100644
index 0000000000..5674e2f1c2
--- /dev/null
+++ b/cpp/include/raft/device_utils.cuh
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <utility>  // pair
+
+namespace raft {
+
+// TODO move to raft https://github.com/rapidsai/raft/issues/90
+/** helper method to get the compute capability version numbers */
+    inline std::pair<int, int> getDeviceCapability()
+    {
+        int devId;
+        RAFT_CUDA_TRY(cudaGetDevice(&devId));
+        int major, minor;
+        RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devId));
+        RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devId));
+        return std::make_pair(major, minor);
+    }
+
+/**
+ * @brief Batched warp-level sum reduction
+ *
+ * @tparam T        data type
+ * @tparam NThreads Number of threads in the warp doing independent reductions
+ *
+ * @param[in] val input value
+ * @return        for the first "group" of threads, the reduced value. All
+ *                others will contain unusable values!
+ *
+ * @note Why not cub? Because cub doesn't seem to allow working with arbitrary
+ *       number of warps in a block and also doesn't support this kind of
+ *       batched reduction operation
+ * @note All threads in the warp must enter this function together
+ *
+ * @todo Expand this to support arbitrary reduction ops
+ */
+    template <typename T, int NThreads>
+    DI T batchedWarpReduce(T val)
+{
+#pragma unroll
+    for (int i = NThreads; i < raft::WarpSize; i <<= 1) {
+    val += raft::shfl(val, raft::laneId() + i);
+}
+return val;
+}
+
+/**
+ * @brief 1-D block-level batched sum reduction
+ *
+ * @tparam T        data type
+ * @tparam NThreads Number of threads in the warp doing independent reductions
+ *
+ * @param val  input value
+ * @param smem shared memory region needed for storing intermediate results. It
+ *             must be at least of size: `sizeof(T) * nWarps * NThreads`
+ * @return     for the first "group" of threads in the block, the reduced value.
+ *             All others will contain unusable values!
+ *
+ * @note Why not cub? Because cub doesn't seem to allow working with arbitrary
+ *       number of warps in a block and also doesn't support this kind of
+ *       batched reduction operation
+ * @note All threads in the block must enter this function together
+ *
+ * @todo Expand this to support arbitrary reduction ops
+ */
+template <typename T, int NThreads>
+DI T batchedBlockReduce(T val, char* smem)
+{
+auto* sTemp                  = reinterpret_cast<T*>(smem);
+constexpr int nGroupsPerWarp = raft::WarpSize / NThreads;
+static_assert(raft::isPo2(nGroupsPerWarp), "nGroupsPerWarp must be a PO2!");
+const int nGroups = (blockDim.x + NThreads - 1) / NThreads;
+const int lid     = raft::laneId();
+const int lgid    = lid % NThreads;
+const int gid     = threadIdx.x / NThreads;
+const auto wrIdx  = (gid / nGroupsPerWarp) * NThreads + lgid;
+const auto rdIdx  = gid * NThreads + lgid;
+for (int i = nGroups; i > 0;) {
+auto iAligned = ((i + nGroupsPerWarp - 1) / nGroupsPerWarp) * nGroupsPerWarp;
+if (gid < iAligned) {
+val = batchedWarpReduce<T, NThreads>(val);
+if (lid < NThreads) sTemp[wrIdx] = val;
+}
+__syncthreads();
+i /= nGroupsPerWarp;
+if (i > 0) { val = gid < i ? sTemp[rdIdx] : T(0); }
+__syncthreads();
+}
+return val;
+}
+
+}  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh
new file mode 100644
index 0000000000..4158ac3179
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/device_utils.cuh>
+#include <raft/linalg/contractions.hpp>
+
+namespace raft {
+    namespace spatial {
+        namespace knn {
+            namespace detail {
+
+        template<typename DataT,
+                typename IdxT,
+                typename Policy,
+                typename BaseClass = raft::linalg::Contractions_NT <DataT, IdxT, Policy>>
+        struct EpsUnexpL2SqNeighborhood : public BaseClass {
+        private:
+            typedef Policy P;
+
+            bool *adj;
+            DataT eps;
+            IdxT *vd;
+
+            char *smem;  // for final reductions
+
+            DataT acc[P::AccRowsPerTh][P::AccColsPerTh];
+
+        public:
+            DI EpsUnexpL2SqNeighborhood(bool *_adj,
+                                        IdxT *_vd,
+                                        const DataT *_x,
+                                        const DataT *_y,
+                                        IdxT _m,
+                                        IdxT _n,
+                                        IdxT _k,
+                                        DataT _eps,
+                                        char *_smem)
+                    : BaseClass(_x, _y, _m, _n, _k, _smem), adj(_adj), eps(_eps), vd(_vd), smem(_smem) {
+            }
+
+            DI void run() {
+                prolog();
+                loop();
+                epilog();
+            }
+
+        private:
+            DI void prolog() {
+                this->ldgXY(0);
+#pragma unroll
+                for (int i = 0; i < P::AccRowsPerTh; ++i) {
+#pragma unroll
+                    for (int j = 0; j < P::AccColsPerTh; ++j) {
+                        acc[i][j] = BaseClass::Zero;
+                    }
+                }
+                this->stsXY();
+                __syncthreads();
+                this->pageWr ^= 1;
+            }
+
+            DI void loop() {
+                for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) {
+                    this->ldgXY(kidx);
+                    accumulate();  // on the previous k-block
+                    this->stsXY();
+                    __syncthreads();
+                    this->pageWr ^= 1;
+                    this->pageRd ^= 1;
+                }
+                accumulate();  // last iteration
+            }
+
+            DI void epilog() {
+                IdxT startx = blockIdx.x * P::Mblk + this->accrowid;
+                IdxT starty = blockIdx.y * P::Nblk + this->acccolid;
+                auto lid = raft::laneId();
+                IdxT sums[P::AccColsPerTh];
+#pragma unroll
+                for (int j = 0; j < P::AccColsPerTh; ++j) {
+                    sums[j] = 0;
+                }
+#pragma unroll
+                for (int i = 0; i < P::AccRowsPerTh; ++i) {
+                    auto xid = startx + i * P::AccThRows;
+#pragma unroll
+                    for (int j = 0; j < P::AccColsPerTh; ++j) {
+                        auto yid = starty + j * P::AccThCols;
+                        auto is_neigh = acc[i][j] <= eps;
+                        ///@todo: fix uncoalesced writes using shared mem
+                        if (xid < this->m && yid < this->n) {
+                            adj[xid * this->n + yid] = is_neigh;
+                            sums[j] += is_neigh;
+                        }
+                    }
+                }
+                // perform reduction of adjacency values to compute vertex degrees
+                if (vd != nullptr) { updateVertexDegree(sums); }
+            }
+
+            DI void accumulate() {
+#pragma unroll
+                for (int ki = 0; ki < P::Kblk; ki += P::Veclen) {
+                    this->ldsXY(ki);
+#pragma unroll
+                    for (int i = 0; i < P::AccRowsPerTh; ++i) {
+#pragma unroll
+                        for (int j = 0; j < P::AccColsPerTh; ++j) {
+#pragma unroll
+                            for (int v = 0; v < P::Veclen; ++v) {
+                                auto diff = this->regx[i][v] - this->regy[j][v];
+                                acc[i][j] += diff * diff;
+                            }
+                        }
+                    }
+                }
+            }
+
+            DI void updateVertexDegree(IdxT (&sums)[P::AccColsPerTh]) {
+                __syncthreads();  // so that we can safely reuse smem
+                int gid = threadIdx.x / P::AccThCols;
+                int lid = threadIdx.x % P::AccThCols;
+                auto cidx = IdxT(blockIdx.y) * P::Nblk + lid;
+                IdxT totalSum = 0;
+                // update the individual vertex degrees
+#pragma unroll
+                for (int i = 0; i < P::AccColsPerTh; ++i) {
+                    sums[i] = batchedBlockReduce<IdxT, P::AccThCols>(sums[i], smem);
+                    auto cid = cidx + i * P::AccThCols;
+                    if (gid == 0 && cid < this->n) {
+                        atomicUpdate(cid, sums[i]);
+                        totalSum += sums[i];
+                    }
+                    __syncthreads();  // for safe smem reuse
+                }
+                // update the total edge count
+                totalSum = raft::blockReduce<IdxT>(totalSum, smem);
+                if (threadIdx.x == 0) { atomicUpdate(this->n, totalSum); }
+            }
+
+            DI void atomicUpdate(IdxT addrId, IdxT val) {
+                if (sizeof(IdxT) == 4) {
+                    raft::myAtomicAdd<unsigned>((unsigned *) (vd + addrId), val);
+                } else if (sizeof(IdxT) == 8) {
+                    raft::myAtomicAdd<unsigned long long>((unsigned long long *) (vd + addrId), val);
+                }
+            }
+        };  // struct EpsUnexpL2SqNeighborhood
+
+        template<typename DataT, typename IdxT, typename Policy>
+        __global__ __launch_bounds__(Policy::Nthreads,
+        2)
+
+        void epsUnexpL2SqNeighKernel(
+                bool *adj, IdxT *vd, const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, DataT eps) {
+            extern __shared__ char smem[];
+            EpsUnexpL2SqNeighborhood<DataT, IdxT, Policy> obj(adj, vd, x, y, m, n, k, eps, smem);
+            obj.run();
+        }
+
+        template<typename DataT, typename IdxT, int VecLen>
+        void epsUnexpL2SqNeighImpl(bool *adj,
+                                   IdxT *vd,
+                                   const DataT *x,
+                                   const DataT *y,
+                                   IdxT m,
+                                   IdxT n,
+                                   IdxT k,
+                                   DataT eps,
+                                   cudaStream_t stream) {
+            typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy Policy;
+            dim3 grid(raft::ceildiv<int>(m, Policy::Mblk), raft::ceildiv<int>(n, Policy::Nblk));
+            dim3 blk(Policy::Nthreads);
+            epsUnexpL2SqNeighKernel < DataT, IdxT, Policy >
+            <<<grid, blk, Policy::SmemSize, stream>>>(adj, vd, x, y, m, n, k, eps);
+            RAFT_CUDA_TRY(cudaGetLastError());
+        }
+
+/**
+ * @brief Computes epsilon neighborhood for the L2-Squared distance metric
+ *
+ * @tparam DataT   IO and math type
+ * @tparam IdxT    Index type
+ *
+ * @param[out] adj    adjacency matrix [row-major] [on device] [dim = m x n]
+ * @param[out] vd     vertex degree array [on device] [len = n + 1]
+ *                    `vd + n` stores the total number of edges in the adjacency
+ *                    matrix. Pass a nullptr if you don't need this info.
+ * @param[in]  x      first matrix [row-major] [on device] [dim = m x k]
+ * @param[in]  y      second matrix [row-major] [on device] [dim = n x k]
+ * @param[in]  eps    defines epsilon neighborhood radius (should be passed as
+ *                    squared as we compute L2-squared distance in this method)
+ * @param[in]  fop    device lambda to do any other custom functions
+ * @param[in]  stream cuda stream
+ */
+        template<typename DataT, typename IdxT>
+        void epsUnexpL2SqNeighborhood(bool *adj,
+                                      IdxT *vd,
+                                      const DataT *x,
+                                      const DataT *y,
+                                      IdxT m,
+                                      IdxT n,
+                                      IdxT k,
+                                      DataT eps,
+                                      cudaStream_t stream) {
+            size_t bytes = sizeof(DataT) * k;
+            if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) {
+                epsUnexpL2SqNeighImpl<DataT, IdxT, 16 / sizeof(DataT)>(adj, vd, x, y, m, n, k, eps, stream);
+            } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) {
+                epsUnexpL2SqNeighImpl<DataT, IdxT, 8 / sizeof(DataT)>(adj, vd, x, y, m, n, k, eps, stream);
+            } else {
+                epsUnexpL2SqNeighImpl<DataT, IdxT, 1>(adj, vd, x, y, m, n, k, eps, stream);
+            }
+        }
+    }    // namespace detail
+    }   // namespace knn
+    }  // namespace spatial
+}  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
new file mode 100644
index 0000000000..6ef95dc010
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/spatial/knn/detail/epsilon_neighborhood.cuh>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+
+/**
+ * @brief Computes epsilon neighborhood for the L2-Squared distance metric
+ *
+ * @tparam DataT   IO and math type
+ * @tparam IdxT    Index type
+ *
+ * @param[out] adj    adjacency matrix [row-major] [on device] [dim = m x n]
+ * @param[out] vd     vertex degree array [on device] [len = n + 1]
+ *                    `vd + n` stores the total number of edges in the adjacency
+ *                    matrix. Pass a nullptr if you don't need this info.
+ * @param[in]  x      first matrix [row-major] [on device] [dim = m x k]
+ * @param[in]  y      second matrix [row-major] [on device] [dim = n x k]
+ * @param[in]  eps    defines epsilon neighborhood radius (should be passed as
+ *                    squared as we compute L2-squared distance in this method)
+ * @param[in]  fop    device lambda to do any other custom functions
+ * @param[in]  stream cuda stream
+ */
+template<typename DataT, typename IdxT>
+void epsUnexpL2SqNeighborhood(bool *adj,
+                              IdxT *vd,
+                              const DataT *x,
+                              const DataT *y,
+                              IdxT m,
+                              IdxT n,
+                              IdxT k,
+                              DataT eps,
+                              cudaStream_t stream) {
+    detail::epsUnexpL2SqNeighborhood<DataT, IdxT>(adj, vd, x, y, m, n, k, eps, stream);
+}
+}   // namespace knn
+}  // namespace spatial
+}  // namespace raft
diff --git a/cpp/test/spatial/epsilon_neighborhood.cu b/cpp/test/spatial/epsilon_neighborhood.cu
new file mode 100644
index 0000000000..be62cf0208
--- /dev/null
+++ b/cpp/test/spatial/epsilon_neighborhood.cu
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "test_utils.h"
+#include <raft/spatial/knn/epsilon_neighborhood.hpp>
+#include <gtest/gtest.h>
+#include <memory>
+#include <raft/cudart_utils.h>
+#include <random/make_blobs.cuh>
+#include <rmm/device_uvector.hpp>
+
+namespace raft {
+namespace spatial {
+namespace knn {
+        template<typename T, typename IdxT>
+        struct EpsInputs {
+            IdxT n_row, n_col, n_centers, n_batches;
+            T eps;
+        };
+
+        template<typename T, typename IdxT>
+        ::std::ostream &operator<<(::std::ostream &os, const EpsInputs<T, IdxT> &p) {
+            return os;
+        }
+
+        template<typename T, typename IdxT>
+        class EpsNeighTest : public ::testing::TestWithParam<EpsInputs<T, IdxT>> {
+        protected:
+            EpsNeighTest() : data(0, stream), adj(0, stream), labels(0, stream), vd(0, stream) {}
+
+            void SetUp() override {
+                param = ::testing::TestWithParam<EpsInputs<T, IdxT>>::GetParam();
+                RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+                data.resize(param.n_row * param.n_col, stream);
+                labels.resize(param.n_row, stream);
+                batchSize = param.n_row / param.n_batches;
+                adj.resize(param.n_row * batchSize, stream);
+                vd.resize(batchSize + 1, stream);
+                RAFT_CUDA_TRY(cudaMemsetAsync(vd.data(), 0, vd.size() * sizeof(IdxT), stream));
+                Random::make_blobs<T, IdxT>(data.data(),
+                                            labels.data(),
+                                            param.n_row,
+                                            param.n_col,
+                                            param.n_centers,
+                                            stream,
+                                            true,
+                                            nullptr,
+                                            nullptr,
+                                            T(0.01),
+                                            false);
+            }
+
+            void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
+
+            EpsInputs<T, IdxT> param;
+            cudaStream_t stream = 0;
+            rmm::device_uvector <T> data;
+            rmm::device_uvector<bool> adj;
+            rmm::device_uvector <IdxT> labels, vd;
+            IdxT batchSize;
+        };  // class EpsNeighTest
+
+        const std::vector <EpsInputs<float, int>> inputsfi = {
+                {15000, 16,    5,  1, 2.f},
+                {14000, 16,    5,  1, 2.f},
+                {15000, 17,    5,  1, 2.f},
+                {14000, 17,    5,  1, 2.f},
+                {15000, 18,    5,  1, 2.f},
+                {14000, 18,    5,  1, 2.f},
+                {15000, 32,    5,  1, 2.f},
+                {14000, 32,    5,  1, 2.f},
+                {20000, 10000, 10, 1, 2.f},
+                {20000, 10000, 10, 2, 2.f},
+        };
+        typedef EpsNeighTest<float, int> EpsNeighTestFI;
+        TEST_P(EpsNeighTestFI, Result
+        ) {
+        for (
+        int i = 0;
+        i<param.
+        n_batches;
+        ++i) {
+        RAFT_CUDA_TRY(cudaMemsetAsync(adj.data(), 0, sizeof(bool) * param.n_row * batchSize, stream)
+        );
+        RAFT_CUDA_TRY(cudaMemsetAsync(vd.data(), 0, sizeof(int) * (batchSize + 1), stream)
+        );
+        epsUnexpL2SqNeighborhood<float, int>(adj
+        .
+
+        data(),
+                vd
+
+        .
+
+        data(),
+                data
+
+        .
+
+        data(),
+                data
+
+        .
+
+        data()
+
+        + (
+        i *batchSize
+        * param.n_col),
+        param.n_row,
+        batchSize,
+        param.n_col,
+        param.
+        eps *param
+        .eps,
+        stream);
+        ASSERT_TRUE(raft::devArrMatch(
+                param.n_row / param.n_centers, vd.data(), batchSize, raft::Compare<int>(), stream)
+        );
+    }
+}
+INSTANTIATE_TEST_CASE_P(EpsNeighTests, EpsNeighTestFI, ::testing::ValuesIn(inputsfi)
+);
+
+};  // namespace knn
+};  // namespace spatial
+};  // namespace raft

From 5a770e9a56b4094c6275ec445723124454574d99 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 10 Feb 2022 19:37:25 -0500
Subject: [PATCH 16/24] Finishing out the move

---
 cpp/include/raft/device_utils.cuh             |  70 ++--
 .../knn/detail/epsilon_neighborhood.cuh       | 374 +++++++++---------
 .../raft/spatial/knn/epsilon_neighborhood.hpp |  17 +-
 cpp/include/raft/stats/common.hpp             |  67 ++++
 cpp/include/raft/stats/detail/histogram.cuh   |  39 +-
 cpp/include/raft/stats/detail/minmax.cuh      |  12 +-
 cpp/include/raft/stats/histogram.hpp          |   2 +
 cpp/test/spatial/epsilon_neighborhood.cu      |   2 +-
 cpp/test/stats/cov.cu                         |   2 +-
 cpp/test/stats/histogram.cu                   |   2 +-
 cpp/test/stats/minmax.cu                      |   2 +-
 cpp/test/stats/weighted_mean.cu               |   2 +-
 12 files changed, 313 insertions(+), 278 deletions(-)
 create mode 100644 cpp/include/raft/stats/common.hpp

diff --git a/cpp/include/raft/device_utils.cuh b/cpp/include/raft/device_utils.cuh
index 5674e2f1c2..d89a484109 100644
--- a/cpp/include/raft/device_utils.cuh
+++ b/cpp/include/raft/device_utils.cuh
@@ -23,15 +23,15 @@ namespace raft {
 
 // TODO move to raft https://github.com/rapidsai/raft/issues/90
 /** helper method to get the compute capability version numbers */
-    inline std::pair<int, int> getDeviceCapability()
-    {
-        int devId;
-        RAFT_CUDA_TRY(cudaGetDevice(&devId));
-        int major, minor;
-        RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devId));
-        RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devId));
-        return std::make_pair(major, minor);
-    }
+inline std::pair<int, int> getDeviceCapability()
+{
+  int devId;
+  RAFT_CUDA_TRY(cudaGetDevice(&devId));
+  int major, minor;
+  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devId));
+  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devId));
+  return std::make_pair(major, minor);
+}
 
 /**
  * @brief Batched warp-level sum reduction
@@ -50,14 +50,14 @@ namespace raft {
  *
  * @todo Expand this to support arbitrary reduction ops
  */
-    template <typename T, int NThreads>
-    DI T batchedWarpReduce(T val)
+template <typename T, int NThreads>
+DI T batchedWarpReduce(T val)
 {
 #pragma unroll
-    for (int i = NThreads; i < raft::WarpSize; i <<= 1) {
+  for (int i = NThreads; i < raft::WarpSize; i <<= 1) {
     val += raft::shfl(val, raft::laneId() + i);
-}
-return val;
+  }
+  return val;
 }
 
 /**
@@ -82,27 +82,27 @@ return val;
 template <typename T, int NThreads>
 DI T batchedBlockReduce(T val, char* smem)
 {
-auto* sTemp                  = reinterpret_cast<T*>(smem);
-constexpr int nGroupsPerWarp = raft::WarpSize / NThreads;
-static_assert(raft::isPo2(nGroupsPerWarp), "nGroupsPerWarp must be a PO2!");
-const int nGroups = (blockDim.x + NThreads - 1) / NThreads;
-const int lid     = raft::laneId();
-const int lgid    = lid % NThreads;
-const int gid     = threadIdx.x / NThreads;
-const auto wrIdx  = (gid / nGroupsPerWarp) * NThreads + lgid;
-const auto rdIdx  = gid * NThreads + lgid;
-for (int i = nGroups; i > 0;) {
-auto iAligned = ((i + nGroupsPerWarp - 1) / nGroupsPerWarp) * nGroupsPerWarp;
-if (gid < iAligned) {
-val = batchedWarpReduce<T, NThreads>(val);
-if (lid < NThreads) sTemp[wrIdx] = val;
-}
-__syncthreads();
-i /= nGroupsPerWarp;
-if (i > 0) { val = gid < i ? sTemp[rdIdx] : T(0); }
-__syncthreads();
-}
-return val;
+  auto* sTemp                  = reinterpret_cast<T*>(smem);
+  constexpr int nGroupsPerWarp = raft::WarpSize / NThreads;
+  static_assert(raft::isPo2(nGroupsPerWarp), "nGroupsPerWarp must be a PO2!");
+  const int nGroups = (blockDim.x + NThreads - 1) / NThreads;
+  const int lid     = raft::laneId();
+  const int lgid    = lid % NThreads;
+  const int gid     = threadIdx.x / NThreads;
+  const auto wrIdx  = (gid / nGroupsPerWarp) * NThreads + lgid;
+  const auto rdIdx  = gid * NThreads + lgid;
+  for (int i = nGroups; i > 0;) {
+    auto iAligned = ((i + nGroupsPerWarp - 1) / nGroupsPerWarp) * nGroupsPerWarp;
+    if (gid < iAligned) {
+      val = batchedWarpReduce<T, NThreads>(val);
+      if (lid < NThreads) sTemp[wrIdx] = val;
+    }
+    __syncthreads();
+    i /= nGroupsPerWarp;
+    if (i > 0) { val = gid < i ? sTemp[rdIdx] : T(0); }
+    __syncthreads();
+  }
+  return val;
 }
 
 }  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh
index 4158ac3179..3b4a8d4174 100644
--- a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh
+++ b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh
@@ -20,176 +20,185 @@
 #include <raft/linalg/contractions.hpp>
 
 namespace raft {
-    namespace spatial {
-        namespace knn {
-            namespace detail {
-
-        template<typename DataT,
-                typename IdxT,
-                typename Policy,
-                typename BaseClass = raft::linalg::Contractions_NT <DataT, IdxT, Policy>>
-        struct EpsUnexpL2SqNeighborhood : public BaseClass {
-        private:
-            typedef Policy P;
-
-            bool *adj;
-            DataT eps;
-            IdxT *vd;
-
-            char *smem;  // for final reductions
-
-            DataT acc[P::AccRowsPerTh][P::AccColsPerTh];
-
-        public:
-            DI EpsUnexpL2SqNeighborhood(bool *_adj,
-                                        IdxT *_vd,
-                                        const DataT *_x,
-                                        const DataT *_y,
-                                        IdxT _m,
-                                        IdxT _n,
-                                        IdxT _k,
-                                        DataT _eps,
-                                        char *_smem)
-                    : BaseClass(_x, _y, _m, _n, _k, _smem), adj(_adj), eps(_eps), vd(_vd), smem(_smem) {
-            }
-
-            DI void run() {
-                prolog();
-                loop();
-                epilog();
-            }
-
-        private:
-            DI void prolog() {
-                this->ldgXY(0);
+namespace spatial {
+namespace knn {
+namespace detail {
+
+template <typename DataT,
+          typename IdxT,
+          typename Policy,
+          typename BaseClass = raft::linalg::Contractions_NT<DataT, IdxT, Policy>>
+struct EpsUnexpL2SqNeighborhood : public BaseClass {
+ private:
+  typedef Policy P;
+
+  bool* adj;
+  DataT eps;
+  IdxT* vd;
+
+  char* smem;  // for final reductions
+
+  DataT acc[P::AccRowsPerTh][P::AccColsPerTh];
+
+ public:
+  DI EpsUnexpL2SqNeighborhood(bool* _adj,
+                              IdxT* _vd,
+                              const DataT* _x,
+                              const DataT* _y,
+                              IdxT _m,
+                              IdxT _n,
+                              IdxT _k,
+                              DataT _eps,
+                              char* _smem)
+    : BaseClass(_x, _y, _m, _n, _k, _smem), adj(_adj), eps(_eps), vd(_vd), smem(_smem)
+  {
+  }
+
+  DI void run()
+  {
+    prolog();
+    loop();
+    epilog();
+  }
+
+ private:
+  DI void prolog()
+  {
+    this->ldgXY(0);
 #pragma unroll
-                for (int i = 0; i < P::AccRowsPerTh; ++i) {
+    for (int i = 0; i < P::AccRowsPerTh; ++i) {
 #pragma unroll
-                    for (int j = 0; j < P::AccColsPerTh; ++j) {
-                        acc[i][j] = BaseClass::Zero;
-                    }
-                }
-                this->stsXY();
-                __syncthreads();
-                this->pageWr ^= 1;
-            }
-
-            DI void loop() {
-                for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) {
-                    this->ldgXY(kidx);
-                    accumulate();  // on the previous k-block
-                    this->stsXY();
-                    __syncthreads();
-                    this->pageWr ^= 1;
-                    this->pageRd ^= 1;
-                }
-                accumulate();  // last iteration
-            }
-
-            DI void epilog() {
-                IdxT startx = blockIdx.x * P::Mblk + this->accrowid;
-                IdxT starty = blockIdx.y * P::Nblk + this->acccolid;
-                auto lid = raft::laneId();
-                IdxT sums[P::AccColsPerTh];
+      for (int j = 0; j < P::AccColsPerTh; ++j) {
+        acc[i][j] = BaseClass::Zero;
+      }
+    }
+    this->stsXY();
+    __syncthreads();
+    this->pageWr ^= 1;
+  }
+
+  DI void loop()
+  {
+    for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) {
+      this->ldgXY(kidx);
+      accumulate();  // on the previous k-block
+      this->stsXY();
+      __syncthreads();
+      this->pageWr ^= 1;
+      this->pageRd ^= 1;
+    }
+    accumulate();  // last iteration
+  }
+
+  DI void epilog()
+  {
+    IdxT startx = blockIdx.x * P::Mblk + this->accrowid;
+    IdxT starty = blockIdx.y * P::Nblk + this->acccolid;
+    auto lid    = raft::laneId();
+    IdxT sums[P::AccColsPerTh];
 #pragma unroll
-                for (int j = 0; j < P::AccColsPerTh; ++j) {
-                    sums[j] = 0;
-                }
+    for (int j = 0; j < P::AccColsPerTh; ++j) {
+      sums[j] = 0;
+    }
 #pragma unroll
-                for (int i = 0; i < P::AccRowsPerTh; ++i) {
-                    auto xid = startx + i * P::AccThRows;
+    for (int i = 0; i < P::AccRowsPerTh; ++i) {
+      auto xid = startx + i * P::AccThRows;
 #pragma unroll
-                    for (int j = 0; j < P::AccColsPerTh; ++j) {
-                        auto yid = starty + j * P::AccThCols;
-                        auto is_neigh = acc[i][j] <= eps;
-                        ///@todo: fix uncoalesced writes using shared mem
-                        if (xid < this->m && yid < this->n) {
-                            adj[xid * this->n + yid] = is_neigh;
-                            sums[j] += is_neigh;
-                        }
-                    }
-                }
-                // perform reduction of adjacency values to compute vertex degrees
-                if (vd != nullptr) { updateVertexDegree(sums); }
-            }
-
-            DI void accumulate() {
-#pragma unroll
-                for (int ki = 0; ki < P::Kblk; ki += P::Veclen) {
-                    this->ldsXY(ki);
+      for (int j = 0; j < P::AccColsPerTh; ++j) {
+        auto yid      = starty + j * P::AccThCols;
+        auto is_neigh = acc[i][j] <= eps;
+        ///@todo: fix uncoalesced writes using shared mem
+        if (xid < this->m && yid < this->n) {
+          adj[xid * this->n + yid] = is_neigh;
+          sums[j] += is_neigh;
+        }
+      }
+    }
+    // perform reduction of adjacency values to compute vertex degrees
+    if (vd != nullptr) { updateVertexDegree(sums); }
+  }
+
+  DI void accumulate()
+  {
 #pragma unroll
-                    for (int i = 0; i < P::AccRowsPerTh; ++i) {
+    for (int ki = 0; ki < P::Kblk; ki += P::Veclen) {
+      this->ldsXY(ki);
 #pragma unroll
-                        for (int j = 0; j < P::AccColsPerTh; ++j) {
+      for (int i = 0; i < P::AccRowsPerTh; ++i) {
 #pragma unroll
-                            for (int v = 0; v < P::Veclen; ++v) {
-                                auto diff = this->regx[i][v] - this->regy[j][v];
-                                acc[i][j] += diff * diff;
-                            }
-                        }
-                    }
-                }
-            }
-
-            DI void updateVertexDegree(IdxT (&sums)[P::AccColsPerTh]) {
-                __syncthreads();  // so that we can safely reuse smem
-                int gid = threadIdx.x / P::AccThCols;
-                int lid = threadIdx.x % P::AccThCols;
-                auto cidx = IdxT(blockIdx.y) * P::Nblk + lid;
-                IdxT totalSum = 0;
-                // update the individual vertex degrees
+        for (int j = 0; j < P::AccColsPerTh; ++j) {
 #pragma unroll
-                for (int i = 0; i < P::AccColsPerTh; ++i) {
-                    sums[i] = batchedBlockReduce<IdxT, P::AccThCols>(sums[i], smem);
-                    auto cid = cidx + i * P::AccThCols;
-                    if (gid == 0 && cid < this->n) {
-                        atomicUpdate(cid, sums[i]);
-                        totalSum += sums[i];
-                    }
-                    __syncthreads();  // for safe smem reuse
-                }
-                // update the total edge count
-                totalSum = raft::blockReduce<IdxT>(totalSum, smem);
-                if (threadIdx.x == 0) { atomicUpdate(this->n, totalSum); }
-            }
-
-            DI void atomicUpdate(IdxT addrId, IdxT val) {
-                if (sizeof(IdxT) == 4) {
-                    raft::myAtomicAdd<unsigned>((unsigned *) (vd + addrId), val);
-                } else if (sizeof(IdxT) == 8) {
-                    raft::myAtomicAdd<unsigned long long>((unsigned long long *) (vd + addrId), val);
-                }
-            }
-        };  // struct EpsUnexpL2SqNeighborhood
-
-        template<typename DataT, typename IdxT, typename Policy>
-        __global__ __launch_bounds__(Policy::Nthreads,
-        2)
-
-        void epsUnexpL2SqNeighKernel(
-                bool *adj, IdxT *vd, const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, DataT eps) {
-            extern __shared__ char smem[];
-            EpsUnexpL2SqNeighborhood<DataT, IdxT, Policy> obj(adj, vd, x, y, m, n, k, eps, smem);
-            obj.run();
-        }
-
-        template<typename DataT, typename IdxT, int VecLen>
-        void epsUnexpL2SqNeighImpl(bool *adj,
-                                   IdxT *vd,
-                                   const DataT *x,
-                                   const DataT *y,
-                                   IdxT m,
-                                   IdxT n,
-                                   IdxT k,
-                                   DataT eps,
-                                   cudaStream_t stream) {
-            typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy Policy;
-            dim3 grid(raft::ceildiv<int>(m, Policy::Mblk), raft::ceildiv<int>(n, Policy::Nblk));
-            dim3 blk(Policy::Nthreads);
-            epsUnexpL2SqNeighKernel < DataT, IdxT, Policy >
-            <<<grid, blk, Policy::SmemSize, stream>>>(adj, vd, x, y, m, n, k, eps);
-            RAFT_CUDA_TRY(cudaGetLastError());
+          for (int v = 0; v < P::Veclen; ++v) {
+            auto diff = this->regx[i][v] - this->regy[j][v];
+            acc[i][j] += diff * diff;
+          }
         }
+      }
+    }
+  }
+
+  DI void updateVertexDegree(IdxT (&sums)[P::AccColsPerTh])
+  {
+    __syncthreads();  // so that we can safely reuse smem
+    int gid       = threadIdx.x / P::AccThCols;
+    int lid       = threadIdx.x % P::AccThCols;
+    auto cidx     = IdxT(blockIdx.y) * P::Nblk + lid;
+    IdxT totalSum = 0;
+    // update the individual vertex degrees
+#pragma unroll
+    for (int i = 0; i < P::AccColsPerTh; ++i) {
+      sums[i]  = batchedBlockReduce<IdxT, P::AccThCols>(sums[i], smem);
+      auto cid = cidx + i * P::AccThCols;
+      if (gid == 0 && cid < this->n) {
+        atomicUpdate(cid, sums[i]);
+        totalSum += sums[i];
+      }
+      __syncthreads();  // for safe smem reuse
+    }
+    // update the total edge count
+    totalSum = raft::blockReduce<IdxT>(totalSum, smem);
+    if (threadIdx.x == 0) { atomicUpdate(this->n, totalSum); }
+  }
+
+  DI void atomicUpdate(IdxT addrId, IdxT val)
+  {
+    if (sizeof(IdxT) == 4) {
+      raft::myAtomicAdd<unsigned>((unsigned*)(vd + addrId), val);
+    } else if (sizeof(IdxT) == 8) {
+      raft::myAtomicAdd<unsigned long long>((unsigned long long*)(vd + addrId), val);
+    }
+  }
+};  // struct EpsUnexpL2SqNeighborhood
+
+template <typename DataT, typename IdxT, typename Policy>
+__global__ __launch_bounds__(Policy::Nthreads, 2)
+
+  void epsUnexpL2SqNeighKernel(
+    bool* adj, IdxT* vd, const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k, DataT eps)
+{
+  extern __shared__ char smem[];
+  EpsUnexpL2SqNeighborhood<DataT, IdxT, Policy> obj(adj, vd, x, y, m, n, k, eps, smem);
+  obj.run();
+}
+
+template <typename DataT, typename IdxT, int VecLen>
+void epsUnexpL2SqNeighImpl(bool* adj,
+                           IdxT* vd,
+                           const DataT* x,
+                           const DataT* y,
+                           IdxT m,
+                           IdxT n,
+                           IdxT k,
+                           DataT eps,
+                           cudaStream_t stream)
+{
+  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy Policy;
+  dim3 grid(raft::ceildiv<int>(m, Policy::Mblk), raft::ceildiv<int>(n, Policy::Nblk));
+  dim3 blk(Policy::Nthreads);
+  epsUnexpL2SqNeighKernel<DataT, IdxT, Policy>
+    <<<grid, blk, Policy::SmemSize, stream>>>(adj, vd, x, y, m, n, k, eps);
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
 
 /**
  * @brief Computes epsilon neighborhood for the L2-Squared distance metric
@@ -208,26 +217,27 @@ namespace raft {
  * @param[in]  fop    device lambda to do any other custom functions
  * @param[in]  stream cuda stream
  */
-        template<typename DataT, typename IdxT>
-        void epsUnexpL2SqNeighborhood(bool *adj,
-                                      IdxT *vd,
-                                      const DataT *x,
-                                      const DataT *y,
-                                      IdxT m,
-                                      IdxT n,
-                                      IdxT k,
-                                      DataT eps,
-                                      cudaStream_t stream) {
-            size_t bytes = sizeof(DataT) * k;
-            if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) {
-                epsUnexpL2SqNeighImpl<DataT, IdxT, 16 / sizeof(DataT)>(adj, vd, x, y, m, n, k, eps, stream);
-            } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) {
-                epsUnexpL2SqNeighImpl<DataT, IdxT, 8 / sizeof(DataT)>(adj, vd, x, y, m, n, k, eps, stream);
-            } else {
-                epsUnexpL2SqNeighImpl<DataT, IdxT, 1>(adj, vd, x, y, m, n, k, eps, stream);
-            }
-        }
-    }    // namespace detail
-    }   // namespace knn
-    }  // namespace spatial
+template <typename DataT, typename IdxT>
+void epsUnexpL2SqNeighborhood(bool* adj,
+                              IdxT* vd,
+                              const DataT* x,
+                              const DataT* y,
+                              IdxT m,
+                              IdxT n,
+                              IdxT k,
+                              DataT eps,
+                              cudaStream_t stream)
+{
+  size_t bytes = sizeof(DataT) * k;
+  if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) {
+    epsUnexpL2SqNeighImpl<DataT, IdxT, 16 / sizeof(DataT)>(adj, vd, x, y, m, n, k, eps, stream);
+  } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) {
+    epsUnexpL2SqNeighImpl<DataT, IdxT, 8 / sizeof(DataT)>(adj, vd, x, y, m, n, k, eps, stream);
+  } else {
+    epsUnexpL2SqNeighImpl<DataT, IdxT, 1>(adj, vd, x, y, m, n, k, eps, stream);
+  }
+}
+}  // namespace detail
+}  // namespace knn
+}  // namespace spatial
 }  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
index 6ef95dc010..a25fd9295c 100644
--- a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
+++ b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
@@ -39,18 +39,19 @@ namespace knn {
  * @param[in]  fop    device lambda to do any other custom functions
  * @param[in]  stream cuda stream
  */
-template<typename DataT, typename IdxT>
-void epsUnexpL2SqNeighborhood(bool *adj,
-                              IdxT *vd,
-                              const DataT *x,
-                              const DataT *y,
+template <typename DataT, typename IdxT>
+void epsUnexpL2SqNeighborhood(bool* adj,
+                              IdxT* vd,
+                              const DataT* x,
+                              const DataT* y,
                               IdxT m,
                               IdxT n,
                               IdxT k,
                               DataT eps,
-                              cudaStream_t stream) {
-    detail::epsUnexpL2SqNeighborhood<DataT, IdxT>(adj, vd, x, y, m, n, k, eps, stream);
+                              cudaStream_t stream)
+{
+  detail::epsUnexpL2SqNeighborhood<DataT, IdxT>(adj, vd, x, y, m, n, k, eps, stream);
 }
-}   // namespace knn
+}  // namespace knn
 }  // namespace spatial
 }  // namespace raft
diff --git a/cpp/include/raft/stats/common.hpp b/cpp/include/raft/stats/common.hpp
new file mode 100644
index 0000000000..765f07a012
--- /dev/null
+++ b/cpp/include/raft/stats/common.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cudart_utils.h>
+
+// This file is a shameless amalgamation of independent works done by
+// Lars Nyland and Andy Adinets
+
+///@todo: add cub's histogram as another option
+
+namespace raft {
+namespace stats {
+
+/** Default mapper which just returns the value of the data itself */
+template <typename DataT, typename IdxT>
+struct IdentityBinner {
+  DI int operator()(DataT val, IdxT row, IdxT col) { return int(val); }
+};
+
+/** Types of supported histogram implementations */
+enum HistType {
+  /** shared mem atomics but with bins to be 1b int's */
+  HistTypeSmemBits1 = 1,
+  /** shared mem atomics but with bins to be 2b int's */
+  HistTypeSmemBits2 = 2,
+  /** shared mem atomics but with bins to be 4b int's */
+  HistTypeSmemBits4 = 4,
+  /** shared mem atomics but with bins to be 1B int's */
+  HistTypeSmemBits8 = 8,
+  /** shared mem atomics but with bins to be 2B int's */
+  HistTypeSmemBits16 = 16,
+  /** use only global atomics */
+  HistTypeGmem,
+  /** uses shared mem atomics to reduce global traffic */
+  HistTypeSmem,
+  /**
+   * uses shared mem atomics with match_any intrinsic to further reduce shared
+   * memory traffic. This can only be enabled on Volta and later architectures.
+   * If one tries to enable this for older arch's, it will fall back to
+   * `HistTypeSmem`.
+   * @note This is to be used only when the input dataset leads to a lot of
+   *       repetitions in a given warp, else, this algo can be much slower than
+   *       `HistTypeSmem`!
+   */
+  HistTypeSmemMatchAny,
+  /** builds a hashmap of active bins in shared mem */
+  HistTypeSmemHash,
+  /** decide at runtime the best algo for the given inputs */
+  HistTypeAuto
+};
+};  // end namespace stats
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/detail/histogram.cuh b/cpp/include/raft/stats/detail/histogram.cuh
index 8c69ba1459..7c03561002 100644
--- a/cpp/include/raft/stats/detail/histogram.cuh
+++ b/cpp/include/raft/stats/detail/histogram.cuh
@@ -19,6 +19,7 @@
 #include <raft/common/seive.cuh>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
+#include <raft/stats/common.hpp>
 #include <raft/vectorized.cuh>
 #include <stdint.h>
 
@@ -31,44 +32,6 @@ namespace raft {
 namespace stats {
 namespace detail {
 
-/** Default mapper which just returns the value of the data itself */
-template <typename DataT, typename IdxT>
-struct IdentityBinner {
-  DI int operator()(DataT val, IdxT row, IdxT col) { return int(val); }
-};
-
-/** Types of support histogram implementations */
-enum HistType {
-  /** shared mem atomics but with bins to be 1b int's */
-  HistTypeSmemBits1 = 1,
-  /** shared mem atomics but with bins to be 2b int's */
-  HistTypeSmemBits2 = 2,
-  /** shared mem atomics but with bins to be 4b int's */
-  HistTypeSmemBits4 = 4,
-  /** shared mem atomics but with bins to ba 1B int's */
-  HistTypeSmemBits8 = 8,
-  /** shared mem atomics but with bins to be 2B int's */
-  HistTypeSmemBits16 = 16,
-  /** use only global atomics */
-  HistTypeGmem,
-  /** uses shared mem atomics to reduce global traffic */
-  HistTypeSmem,
-  /**
-   * uses shared mem atomics with match_any intrinsic to further reduce shared
-   * memory traffic. This can only be enabled on Volta and later architectures.
-   * If one tries to enable this for older arch's, it will fall back to
-   * `HistTypeSmem`.
-   * @note This is to be used only when the input dataset leads to a lot of
-   *       repetitions in a given warp, else, this algo can be much slower than
-   *       `HistTypeSmem`!
-   */
-  HistTypeSmemMatchAny,
-  /** builds a hashmap of active bins in shared mem */
-  HistTypeSmemHash,
-  /** decide at runtime the best algo for the given inputs */
-  HistTypeAuto
-};
-
 static const int ThreadsPerBlock = 256;
 
 template <typename IdxT, int VecLen>
diff --git a/cpp/include/raft/stats/detail/minmax.cuh b/cpp/include/raft/stats/detail/minmax.cuh
index c2b14f1544..2a4a9bff93 100644
--- a/cpp/include/raft/stats/detail/minmax.cuh
+++ b/cpp/include/raft/stats/detail/minmax.cuh
@@ -23,16 +23,11 @@
 
 namespace raft {
 namespace stats {
-
 namespace detail {
 
 // TODO: replace with `std::bitcast` once we adopt C++20 or libcu++ adds it
 template <class To, class From>
-constexpr To
-
-bit_cast(const From& from)
-
-  noexcept
+constexpr To bit_cast(const From& from) noexcept
 {
   To to{};
   static_assert(sizeof(To) == sizeof(From));
@@ -40,8 +35,6 @@ bit_cast(const From& from)
   return to;
 }
 
-}  // namespace detail
-
 template <typename T>
 struct encode_traits {
 };
@@ -243,5 +236,4 @@ void minmax(const T* data,
 
 };  // end namespace detail
 };  // end namespace stats
-}
-;  // end namespace raft
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/histogram.hpp b/cpp/include/raft/stats/histogram.hpp
index 30e982115a..d4d3b449f7 100644
--- a/cpp/include/raft/stats/histogram.hpp
+++ b/cpp/include/raft/stats/histogram.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <raft/stats/common.hpp>
 #include <raft/stats/detail/histogram.cuh>
 
 // This file is a shameless amalgamation of independent works done by
@@ -25,6 +26,7 @@
 
 namespace raft {
 namespace stats {
+
 /**
  * @brief Perform histogram on the input data. It chooses the right load size
  * based on the input data vector length. It also supports large-bin cases
diff --git a/cpp/test/spatial/epsilon_neighborhood.cu b/cpp/test/spatial/epsilon_neighborhood.cu
index be62cf0208..7667f742ca 100644
--- a/cpp/test/spatial/epsilon_neighborhood.cu
+++ b/cpp/test/spatial/epsilon_neighborhood.cu
@@ -19,7 +19,7 @@
 #include <gtest/gtest.h>
 #include <memory>
 #include <raft/cudart_utils.h>
-#include <random/make_blobs.cuh>
+#include <raft/random/make_blobs.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace raft {
diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu
index 02e01deec3..2db64a7999 100644
--- a/cpp/test/stats/cov.cu
+++ b/cpp/test/stats/cov.cu
@@ -18,7 +18,7 @@
 #include <gtest/gtest.h>
 #include <raft/cudart_utils.h>
 #include <raft/random/rng.hpp>
-#include <raft/stats/cov.cuh>
+#include <raft/stats/cov.hpp>
 #include <raft/stats/mean.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/test/stats/histogram.cu b/cpp/test/stats/histogram.cu
index b89e2ab208..ff538fcdca 100644
--- a/cpp/test/stats/histogram.cu
+++ b/cpp/test/stats/histogram.cu
@@ -20,7 +20,7 @@
 #include <raft/cudart_utils.h>
 #include <raft/interruptible.hpp>
 #include <raft/random/rng.hpp>
-#include <raft/stats/histogram.cuh>
+#include <raft/stats/histogram.hpp>
 
 namespace raft {
 namespace stats {
diff --git a/cpp/test/stats/minmax.cu b/cpp/test/stats/minmax.cu
index 777ac800e9..61b16b65ae 100644
--- a/cpp/test/stats/minmax.cu
+++ b/cpp/test/stats/minmax.cu
@@ -20,7 +20,7 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/random/rng.hpp>
-#include <raft/stats/minmax.cuh>
+#include <raft/stats/minmax.hpp>
 #include <stdio.h>
 #include <stdlib.h>
 
diff --git a/cpp/test/stats/weighted_mean.cu b/cpp/test/stats/weighted_mean.cu
index df77a19d73..ee58747b69 100644
--- a/cpp/test/stats/weighted_mean.cu
+++ b/cpp/test/stats/weighted_mean.cu
@@ -18,7 +18,7 @@
 #include <gtest/gtest.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/random/rng.hpp>
-#include <raft/stats/weighted_mean.cuh>
+#include <raft/stats/weighted_mean.hpp>
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 

From 895c1c4e8ff64d065ddd8ae560b09cc0627901bd Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 10 Feb 2022 19:43:53 -0500
Subject: [PATCH 17/24] Fixing style

---
 cpp/test/CMakeLists.txt                  |   1 +
 cpp/test/spatial/epsilon_neighborhood.cu | 207 +++++++++++------------
 2 files changed, 99 insertions(+), 109 deletions(-)

diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index fe51e5af02..430b69341c 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -107,6 +107,7 @@ add_executable(test_raft
     test/spatial/fused_l2_knn.cu
     test/spatial/haversine.cu
     test/spatial/ball_cover.cu
+    test/spatial/epsilon_neighborhood.cu
     test/spatial/faiss_mr.cu
     test/spatial/selection.cu
     test/spectral_matrix.cu
diff --git a/cpp/test/spatial/epsilon_neighborhood.cu b/cpp/test/spatial/epsilon_neighborhood.cu
index 7667f742ca..b39148957e 100644
--- a/cpp/test/spatial/epsilon_neighborhood.cu
+++ b/cpp/test/spatial/epsilon_neighborhood.cu
@@ -15,125 +15,114 @@
  */
 
 #include "test_utils.h"
-#include <raft/spatial/knn/epsilon_neighborhood.hpp>
 #include <gtest/gtest.h>
 #include <memory>
 #include <raft/cudart_utils.h>
 #include <raft/random/make_blobs.hpp>
+#include <raft/spatial/knn/epsilon_neighborhood.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace raft {
 namespace spatial {
 namespace knn {
-        template<typename T, typename IdxT>
-        struct EpsInputs {
-            IdxT n_row, n_col, n_centers, n_batches;
-            T eps;
-        };
-
-        template<typename T, typename IdxT>
-        ::std::ostream &operator<<(::std::ostream &os, const EpsInputs<T, IdxT> &p) {
-            return os;
-        }
-
-        template<typename T, typename IdxT>
-        class EpsNeighTest : public ::testing::TestWithParam<EpsInputs<T, IdxT>> {
-        protected:
-            EpsNeighTest() : data(0, stream), adj(0, stream), labels(0, stream), vd(0, stream) {}
-
-            void SetUp() override {
-                param = ::testing::TestWithParam<EpsInputs<T, IdxT>>::GetParam();
-                RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-                data.resize(param.n_row * param.n_col, stream);
-                labels.resize(param.n_row, stream);
-                batchSize = param.n_row / param.n_batches;
-                adj.resize(param.n_row * batchSize, stream);
-                vd.resize(batchSize + 1, stream);
-                RAFT_CUDA_TRY(cudaMemsetAsync(vd.data(), 0, vd.size() * sizeof(IdxT), stream));
-                Random::make_blobs<T, IdxT>(data.data(),
-                                            labels.data(),
-                                            param.n_row,
-                                            param.n_col,
-                                            param.n_centers,
-                                            stream,
-                                            true,
-                                            nullptr,
-                                            nullptr,
-                                            T(0.01),
-                                            false);
-            }
-
-            void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
-
-            EpsInputs<T, IdxT> param;
-            cudaStream_t stream = 0;
-            rmm::device_uvector <T> data;
-            rmm::device_uvector<bool> adj;
-            rmm::device_uvector <IdxT> labels, vd;
-            IdxT batchSize;
-        };  // class EpsNeighTest
-
-        const std::vector <EpsInputs<float, int>> inputsfi = {
-                {15000, 16,    5,  1, 2.f},
-                {14000, 16,    5,  1, 2.f},
-                {15000, 17,    5,  1, 2.f},
-                {14000, 17,    5,  1, 2.f},
-                {15000, 18,    5,  1, 2.f},
-                {14000, 18,    5,  1, 2.f},
-                {15000, 32,    5,  1, 2.f},
-                {14000, 32,    5,  1, 2.f},
-                {20000, 10000, 10, 1, 2.f},
-                {20000, 10000, 10, 2, 2.f},
-        };
-        typedef EpsNeighTest<float, int> EpsNeighTestFI;
-        TEST_P(EpsNeighTestFI, Result
-        ) {
-        for (
-        int i = 0;
-        i<param.
-        n_batches;
-        ++i) {
-        RAFT_CUDA_TRY(cudaMemsetAsync(adj.data(), 0, sizeof(bool) * param.n_row * batchSize, stream)
-        );
-        RAFT_CUDA_TRY(cudaMemsetAsync(vd.data(), 0, sizeof(int) * (batchSize + 1), stream)
-        );
-        epsUnexpL2SqNeighborhood<float, int>(adj
-        .
-
-        data(),
-                vd
-
-        .
-
-        data(),
-                data
-
-        .
-
-        data(),
-                data
-
-        .
-
-        data()
-
-        + (
-        i *batchSize
-        * param.n_col),
-        param.n_row,
-        batchSize,
-        param.n_col,
-        param.
-        eps *param
-        .eps,
-        stream);
-        ASSERT_TRUE(raft::devArrMatch(
-                param.n_row / param.n_centers, vd.data(), batchSize, raft::Compare<int>(), stream)
-        );
-    }
+template <typename T, typename IdxT>
+struct EpsInputs {
+  IdxT n_row, n_col, n_centers, n_batches;
+  T eps;
+};
+
+template <typename T, typename IdxT>
+::std::ostream& operator<<(::std::ostream& os, const EpsInputs<T, IdxT>& p)
+{
+  return os;
+}
+
+template <typename T, typename IdxT>
+class EpsNeighTest : public ::testing::TestWithParam<EpsInputs<T, IdxT>> {
+ protected:
+  EpsNeighTest() : data(0, stream), adj(0, stream), labels(0, stream), vd(0, stream) {}
+
+  void SetUp() override
+  {
+    param = ::testing::TestWithParam<EpsInputs<T, IdxT>>::GetParam();
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+    data.resize(param.n_row * param.n_col, stream);
+    labels.resize(param.n_row, stream);
+    batchSize = param.n_row / param.n_batches;
+    adj.resize(param.n_row * batchSize, stream);
+    vd.resize(batchSize + 1, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(vd.data(), 0, vd.size() * sizeof(IdxT), stream));
+    random::make_blobs<T, IdxT>(data.data(),
+                                labels.data(),
+                                param.n_row,
+                                param.n_col,
+                                param.n_centers,
+                                stream,
+                                true,
+                                nullptr,
+                                nullptr,
+                                T(0.01),
+                                false);
+  }
+
+  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
+
+  EpsInputs<T, IdxT> param;
+  cudaStream_t stream = 0;
+  rmm::device_uvector<T> data;
+  rmm::device_uvector<bool> adj;
+  rmm::device_uvector<IdxT> labels, vd;
+  IdxT batchSize;
+};  // class EpsNeighTest
+
+const std::vector<EpsInputs<float, int>> inputsfi = {
+  {15000, 16, 5, 1, 2.f},
+  {14000, 16, 5, 1, 2.f},
+  {15000, 17, 5, 1, 2.f},
+  {14000, 17, 5, 1, 2.f},
+  {15000, 18, 5, 1, 2.f},
+  {14000, 18, 5, 1, 2.f},
+  {15000, 32, 5, 1, 2.f},
+  {14000, 32, 5, 1, 2.f},
+  {20000, 10000, 10, 1, 2.f},
+  {20000, 10000, 10, 2, 2.f},
+};
+typedef EpsNeighTest<float, int> EpsNeighTestFI;
+TEST_P(EpsNeighTestFI, Result)
+{
+  for (int i = 0; i < param.n_batches; ++i) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(adj.data(), 0, sizeof(bool) * param.n_row * batchSize, stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(vd.data(), 0, sizeof(int) * (batchSize + 1), stream));
+    epsUnexpL2SqNeighborhood<float, int>(adj.
+
+                                         data(),
+                                         vd
+
+                                           .
+
+                                         data(),
+                                         data
+
+                                           .
+
+                                         data(),
+                                         data
+
+                                             .
+
+                                           data()
+
+                                           + (i * batchSize * param.n_col),
+                                         param.n_row,
+                                         batchSize,
+                                         param.n_col,
+                                         param.eps * param.eps,
+                                         stream);
+    ASSERT_TRUE(raft::devArrMatch(
+      param.n_row / param.n_centers, vd.data(), batchSize, raft::Compare<int>(), stream));
+  }
 }
-INSTANTIATE_TEST_CASE_P(EpsNeighTests, EpsNeighTestFI, ::testing::ValuesIn(inputsfi)
-);
+INSTANTIATE_TEST_CASE_P(EpsNeighTests, EpsNeighTestFI, ::testing::ValuesIn(inputsfi));
 
 };  // namespace knn
 };  // namespace spatial

From ce425d4d89ebdcf1727f2b068a76364333161245 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 10 Feb 2022 19:56:25 -0500
Subject: [PATCH 18/24] Updating year and seive

---
 cpp/include/raft/common/{seive.cuh => seive.hpp} | 3 +--
 cpp/include/raft/stats/detail/histogram.cuh      | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)
 rename cpp/include/raft/common/{seive.cuh => seive.hpp} (97%)

diff --git a/cpp/include/raft/common/seive.cuh b/cpp/include/raft/common/seive.hpp
similarity index 97%
rename from cpp/include/raft/common/seive.cuh
rename to cpp/include/raft/common/seive.hpp
index 580d9d91cb..6d7de24ecd 100644
--- a/cpp/include/raft/common/seive.cuh
+++ b/cpp/include/raft/common/seive.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,6 @@
  */
 #pragma once
 
-#include <raft/cuda_utils.cuh>
 #include <vector>
 
 // Taken from:
diff --git a/cpp/include/raft/stats/detail/histogram.cuh b/cpp/include/raft/stats/detail/histogram.cuh
index 7c03561002..65241f524f 100644
--- a/cpp/include/raft/stats/detail/histogram.cuh
+++ b/cpp/include/raft/stats/detail/histogram.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/common/seive.cuh>
+#include <raft/common/seive.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/stats/common.hpp>

From 3d0b16769578b4c71e6e6fbbe243f6128b9dcc49 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 10 Feb 2022 19:58:08 -0500
Subject: [PATCH 19/24] proper include

---
 cpp/test/spatial/epsilon_neighborhood.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/test/spatial/epsilon_neighborhood.cu b/cpp/test/spatial/epsilon_neighborhood.cu
index b39148957e..33af5726a0 100644
--- a/cpp/test/spatial/epsilon_neighborhood.cu
+++ b/cpp/test/spatial/epsilon_neighborhood.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "test_utils.h"
+#include "../test_utils.h"
 #include <gtest/gtest.h>
 #include <memory>
 #include <raft/cudart_utils.h>

From 368513b14e84aa4af50a3f60adbba794596b7450 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 10 Feb 2022 20:59:19 -0500
Subject: [PATCH 20/24] Proper filename

---
 cpp/test/common/seive.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/test/common/seive.cu b/cpp/test/common/seive.cu
index ca46397b19..8044dbb532 100644
--- a/cpp/test/common/seive.cu
+++ b/cpp/test/common/seive.cu
@@ -15,7 +15,7 @@
  */
 
 #include <gtest/gtest.h>
-#include <raft/common/seive.cuh>
+#include <raft/common/seive.hpp>
 
 namespace raft {
 namespace common {

From 8b5e5c50006161925739779dd0a07b86c04a1f8b Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 10 Feb 2022 22:05:32 -0500
Subject: [PATCH 21/24] Adding missing include

---
 cpp/include/raft/common/seive.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/include/raft/common/seive.hpp b/cpp/include/raft/common/seive.hpp
index 6d7de24ecd..e613f1e5c2 100644
--- a/cpp/include/raft/common/seive.hpp
+++ b/cpp/include/raft/common/seive.hpp
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <raft/cuda_utils.cuh>
 #include <vector>
 
 // Taken from:

From 413b0d462e5a57b2ec0fabb538dd7d2bdfa3dd10 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 11 Feb 2022 15:03:09 -0500
Subject: [PATCH 22/24] Updating rsvd test

---
 cpp/include/raft/random/detail/make_blobs.cuh | 133 +++++++++++++-----
 cpp/test/linalg/rsvd.cu                       | 112 ++++++++-------
 cpp/test/random/make_blobs.cu                 | 110 ++++++---------
 3 files changed, 206 insertions(+), 149 deletions(-)

diff --git a/cpp/include/raft/random/detail/make_blobs.cuh b/cpp/include/raft/random/detail/make_blobs.cuh
index 528d20a284..fff1ab835b 100644
--- a/cpp/include/raft/random/detail/make_blobs.cuh
+++ b/cpp/include/raft/random/detail/make_blobs.cuh
@@ -16,18 +16,18 @@
 
 #pragma once
 
+#include "permute.cuh"
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/linalg/unary_op.hpp>
-#include <raft/random/permute.hpp>
 #include <raft/random/rng.hpp>
 #include <rmm/device_uvector.hpp>
 #include <vector>
 
-namespace raft::random {
-namespace detail {
+namespace raft {
+namespace random {
 
-namespace {
+namespace detail {
 
 // generate the labels first and shuffle them instead of shuffling the dataset
 template <typename IdxT>
@@ -90,23 +90,29 @@ DI void get_mu_sigma(DataT& mu,
 }
 
 template <typename DataT, typename IdxT>
-void generate_data(DataT* out,
-                   const IdxT* labels,
-                   IdxT n_rows,
-                   IdxT n_cols,
-                   IdxT n_clusters,
-                   cudaStream_t stream,
-                   bool row_major,
-                   const DataT* centers,
-                   const DataT* cluster_std,
-                   const DataT cluster_std_scalar,
-                   raft::random::Rng& rng)
+__global__ void generate_data_kernel(DataT* out,
+                                     const IdxT* labels,
+                                     IdxT n_rows,
+                                     IdxT n_cols,
+                                     IdxT n_clusters,
+                                     bool row_major,
+                                     const DataT* centers,
+                                     const DataT* cluster_std,
+                                     const DataT cluster_std_scalar,
+                                     raft::random::RngState rng_state)
 {
-  auto op = [=] __device__(DataT & val1, DataT & val2, IdxT idx1, IdxT idx2) {
+  uint64_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+  raft::random::PhiloxGenerator gen(rng_state, tid);
+  const IdxT stride = gridDim.x * blockDim.x;
+  IdxT len          = n_rows * n_cols;
+  for (IdxT idx = tid; idx < len; idx += stride) {
+    DataT val1, val2;
+    gen.next(val1);
+    gen.next(val2);
     DataT mu1, sigma1, mu2, sigma2;
     get_mu_sigma(mu1,
                  sigma1,
-                 idx1,
+                 idx,
                  labels,
                  row_major,
                  centers,
@@ -117,7 +123,7 @@ void generate_data(DataT* out,
                  n_clusters);
     get_mu_sigma(mu2,
                  sigma2,
-                 idx2,
+                 idx + stride,
                  labels,
                  row_major,
                  centers,
@@ -127,12 +133,74 @@ void generate_data(DataT* out,
                  n_cols,
                  n_clusters);
     raft::random::box_muller_transform<DataT>(val1, val2, sigma1, mu1, sigma2, mu2);
-  };
-  rng.custom_distribution2<DataT, DataT, IdxT>(out, n_rows * n_cols, op, stream);
+
+    if (idx < len) out[idx] = val1;
+    idx += stride;
+    if (idx < len) out[idx] = val2;
+  }
 }
 
-}  // namespace
+template <typename DataT, typename IdxT>
+void generate_data(DataT* out,
+                   const IdxT* labels,
+                   IdxT n_rows,
+                   IdxT n_cols,
+                   IdxT n_clusters,
+                   cudaStream_t stream,
+                   bool row_major,
+                   const DataT* centers,
+                   const DataT* cluster_std,
+                   const DataT cluster_std_scalar,
+                   raft::random::RngState& rng_state)
+{
+  IdxT items   = n_rows * n_cols;
+  IdxT nBlocks = (items + 127) / 128;
+  generate_data_kernel<<<nBlocks, 128, 0, stream>>>(out,
+                                                    labels,
+                                                    n_rows,
+                                                    n_cols,
+                                                    n_clusters,
+                                                    row_major,
+                                                    centers,
+                                                    cluster_std,
+                                                    cluster_std_scalar,
+                                                    rng_state);
+}
 
+/**
+ * @brief GPU-equivalent of sklearn.datasets.make_blobs
+ *
+ * @tparam DataT output data type
+ * @tparam IdxT  indexing arithmetic type
+ *
+ * @param[out] out                generated data [on device]
+ *                                [dim = n_rows x n_cols]
+ * @param[out] labels             labels for the generated data [on device]
+ *                                [len = n_rows]
+ * @param[in]  n_rows             number of rows in the generated data
+ * @param[in]  n_cols             number of columns in the generated data
+ * @param[in]  n_clusters         number of clusters (or classes) to generate
+ * @param[in]  stream             cuda stream to schedule the work on
+ * @param[in]  row_major          whether input `centers` and output `out`
+ *                                buffers are to be stored in row or column
+ *                                major layout
+ * @param[in]  centers            centers of each of the clusters, pass a nullptr
+ *                                if you need this also to be generated randomly
+ *                                [on device] [dim = n_clusters x n_cols]
+ * @param[in]  cluster_std        standard deviation of each cluster center,
+ *                                pass a nullptr if this is to be read from the
+ *                                `cluster_std_scalar`. [on device]
+ *                                [len = n_clusters]
+ * @param[in]  cluster_std_scalar if 'cluster_std' is nullptr, then use this as
+ *                                the std-dev across all dimensions.
+ * @param[in]  shuffle            shuffle the generated dataset and labels
+ * @param[in]  center_box_min     min value of box from which to pick cluster
+ *                                centers. Useful only if 'centers' is nullptr
+ * @param[in]  center_box_max     max value of box from which to pick cluster
+ *                                centers. Useful only if 'centers' is nullptr
+ * @param[in]  seed               seed for the RNG
+ * @param[in]  type               RNG type
+ */
 template <typename DataT, typename IdxT>
 void make_blobs_caller(DataT* out,
                        IdxT* labels,
@@ -140,15 +208,15 @@ void make_blobs_caller(DataT* out,
                        IdxT n_cols,
                        IdxT n_clusters,
                        cudaStream_t stream,
-                       bool row_major                   = true,
-                       const DataT* centers             = nullptr,
-                       const DataT* cluster_std         = nullptr,
-                       const DataT cluster_std_scalar   = (DataT)1.0,
-                       bool shuffle                     = true,
-                       DataT center_box_min             = (DataT)-10.0,
-                       DataT center_box_max             = (DataT)10.0,
-                       uint64_t seed                    = 0ULL,
-                       raft::random::GeneratorType type = raft::random::GenPhilox)
+                       bool row_major,
+                       const DataT* centers,
+                       const DataT* cluster_std,
+                       const DataT cluster_std_scalar,
+                       bool shuffle,
+                       DataT center_box_min,
+                       DataT center_box_max,
+                       uint64_t seed,
+                       raft::random::GeneratorType type)
 {
   raft::random::Rng r(seed, type);
   // use the right centers buffer for data generation
@@ -172,8 +240,9 @@ void make_blobs_caller(DataT* out,
                 _centers,
                 cluster_std,
                 cluster_std_scalar,
-                r);
+                r.state);
 }
 
 }  // end namespace detail
-}  // end namespace raft::random
\ No newline at end of file
+}  // end namespace random
+}  // end namespace raft
\ No newline at end of file
diff --git a/cpp/test/linalg/rsvd.cu b/cpp/test/linalg/rsvd.cu
index b8e44580b5..da38464bf7 100644
--- a/cpp/test/linalg/rsvd.cu
+++ b/cpp/test/linalg/rsvd.cu
@@ -31,6 +31,7 @@ struct RsvdInputs {
   T tolerance;
   int n_row;
   int n_col;
+  float redundancy;
   T PC_perc;
   T UpS_perc;
   int k;
@@ -66,7 +67,7 @@ class RsvdTest : public ::testing::TestWithParam<RsvdInputs<T>> {
 
     params = ::testing::TestWithParam<RsvdInputs<T>>::GetParam();
     // rSVD seems to be very sensitive to the random number sequence as well!
-    raft::random::Rng r(params.seed, raft::random::GenTaps);
+    raft::random::Rng r(params.seed, raft::random::GenPC);
     int m = params.n_row, n = params.n_col;
     T eig_svd_tol  = 1.e-7;
     int max_sweeps = 100;
@@ -91,8 +92,19 @@ class RsvdTest : public ::testing::TestWithParam<RsvdInputs<T>> {
       raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, n * 1, stream);
       raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, 1, stream);
 
-    } else {  // Other normal tests
-      r.normal(A.data(), m * n, mu, sigma, stream);
+    } else {                                 // Other normal tests
+      int n_informative   = int(0.25f * n);  // Informative cols
+      int len_informative = m * n_informative;
+
+      int n_redundant   = n - n_informative;  // Redundant cols
+      int len_redundant = m * n_redundant;
+
+      r.normal(A.data(), len_informative, mu, sigma, stream);
+      CUDA_CHECK(cudaMemcpyAsync(A.data() + len_informative,
+                                 A.data(),
+                                 len_redundant * sizeof(T),
+                                 cudaMemcpyDeviceToDevice,
+                                 stream));
     }
     std::vector<T> A_backup_cpu(m *
                                 n);  // Backup A matrix as svdJacobi will destroy the content of A
@@ -157,59 +169,65 @@ class RsvdTest : public ::testing::TestWithParam<RsvdInputs<T>> {
 
 const std::vector<RsvdInputs<float>> inputs_fx = {
   // Test with ratios
-  {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL},     // Square + BBT
-  {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, true, 4321ULL},    // Tall + BBT
-  {0.20f, 256, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL},    // Square + non-BBT
-  {0.20f, 2048, 256, 0.2f, 0.05f, 0, 0, false, 4321ULL},   // Tall + non-BBT
-  {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL},   // Square + BBT
-  {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, true, 4321ULL},  // Tall + BBT
-  {0.20f, 2048, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL},  // Square + non-BBT
-  {0.60f, 16384, 2048, 0.2f, 0.05f, 0, 0, false, 4321ULL}  // Tall + non-BBT
-
-  ,                                                         // Test with fixed ranks
-  {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL},     // Square + BBT
-  {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, true, 4321ULL},    // Tall + BBT
-  {0.10f, 256, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL},    // Square + non-BBT
-  {0.12f, 2048, 256, 0.0f, 0.0f, 100, 5, false, 4321ULL},   // Tall + non-BBT
-  {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL},   // Square + BBT
-  {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, true, 4321ULL},  // Tall + BBT
-  {0.60f, 2048, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL},  // Square + non-BBT
-  {1.00f, 16384, 2048, 0.0f, 0.0f, 100, 5, false, 4321ULL}  // Tall + non-BBT
+  {0.20f, 256, 256, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},   // Square + BBT
+  {0.20f, 2048, 256, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},  // Tall + BBT
+
+  {0.20f, 256, 256, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL},   // Square + non-BBT
+  {0.20f, 2048, 256, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL},  // Tall + non-BBT
+
+  {0.20f, 2048, 2048, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},   // Square + BBT
+  {0.60f, 16384, 2048, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},  // Tall + BBT
+
+  {0.20f, 2048, 2048, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL},  // Square + non-BBT
+  {0.60f, 16384, 2048, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL}  // Tall + non-BBT
+
+  ,                                                              // Test with fixed ranks
+  {0.10f, 256, 256, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL},   // Square + BBT
+  {0.12f, 2048, 256, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL},  // Tall + BBT
+
+  {0.10f, 256, 256, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL},   // Square + non-BBT
+  {0.12f, 2048, 256, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL},  // Tall + non-BBT
+
+  {0.60f, 2048, 2048, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL},   // Square + BBT
+  {1.00f, 16384, 2048, 0.25f, 0.0f, 0.0f, 100, 5, true, 4321ULL},  // Tall + BBT
+
+  {0.60f, 2048, 2048, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL},  // Square + non-BBT
+  {1.00f, 16384, 2048, 0.25f, 0.0f, 0.0f, 100, 5, false, 4321ULL}  // Tall + non-BBT
 };
 
 const std::vector<RsvdInputs<double>> inputs_dx = {
   // Test with ratios
-  {0.20, 256, 256, 0.2, 0.05, 0, 0, true, 4321ULL},     // Square + BBT
-  {0.20, 2048, 256, 0.2, 0.05, 0, 0, true, 4321ULL},    // Tall + BBT
-  {0.20, 256, 256, 0.2, 0.05, 0, 0, false, 4321ULL},    // Square + non-BBT
-  {0.20, 2048, 256, 0.2, 0.05, 0, 0, false, 4321ULL},   // Tall + non-BBT
-  {0.20, 2048, 2048, 0.2, 0.05, 0, 0, true, 4321ULL},   // Square + BBT
-  {0.60, 16384, 2048, 0.2, 0.05, 0, 0, true, 4321ULL},  // Tall + BBT
-  {0.20, 2048, 2048, 0.2, 0.05, 0, 0, false, 4321ULL},  // Square + non-BBT
-  {0.60, 16384, 2048, 0.2, 0.05, 0, 0, false, 4321ULL}  // Tall + non-BBT
-
-  ,                                                      // Test with fixed ranks
-  {0.10, 256, 256, 0.0, 0.0, 100, 5, true, 4321ULL},     // Square + BBT
-  {0.12, 2048, 256, 0.0, 0.0, 100, 5, true, 4321ULL},    // Tall + BBT
-  {0.10, 256, 256, 0.0, 0.0, 100, 5, false, 4321ULL},    // Square + non-BBT
-  {0.12, 2048, 256, 0.0, 0.0, 100, 5, false, 4321ULL},   // Tall + non-BBT
-  {0.60, 2048, 2048, 0.0, 0.0, 100, 5, true, 4321ULL},   // Square + BBT
-  {1.00, 16384, 2048, 0.0, 0.0, 100, 5, true, 4321ULL},  // Tall + BBT
-  {0.60, 2048, 2048, 0.0, 0.0, 100, 5, false, 4321ULL},  // Square + non-BBT
-  {1.00, 16384, 2048, 0.0, 0.0, 100, 5, false, 4321ULL}  // Tall + non-BBT
+  {0.20, 256, 256, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},     // Square + BBT
+  {0.20, 2048, 256, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},    // Tall + BBT
+  {0.20, 256, 256, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL},    // Square + non-BBT
+  {0.20, 2048, 256, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL},   // Tall + non-BBT
+  {0.20, 2048, 2048, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},   // Square + BBT
+  {0.60, 16384, 2048, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},  // Tall + BBT
+  {0.20, 2048, 2048, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL},  // Square + non-BBT
+  {0.60, 16384, 2048, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL}  // Tall + non-BBT
+
+  ,                                                             // Test with fixed ranks
+  {0.10, 256, 256, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL},     // Square + BBT
+  {0.12, 2048, 256, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL},    // Tall + BBT
+  {0.10, 256, 256, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL},    // Square + non-BBT
+  {0.12, 2048, 256, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL},   // Tall + non-BBT
+  {0.60, 2048, 2048, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL},   // Square + BBT
+  {1.00, 16384, 2048, 0.25f, 0.0, 0.0, 100, 5, true, 4321ULL},  // Tall + BBT
+  {0.60, 2048, 2048, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL},  // Square + non-BBT
+  {1.00, 16384, 2048, 0.25f, 0.0, 0.0, 100, 5, false, 4321ULL}  // Tall + non-BBT
 };
 
 const std::vector<RsvdInputs<float>> sanity_inputs_fx = {
-  {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, true, 4321ULL},
-  {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, true, 4321ULL},
-  {100000000000000000.0f, 3, 2, 0.2f, 0.05f, 0, 0, false, 4321ULL},
-  {100000000000000000.0f, 3, 2, 0.0f, 0.0f, 1, 1, false, 4321ULL}};
+  {100000000000000000.0f, 3, 2, 0.25f, 0.2f, 0.05f, 0, 0, true, 4321ULL},
+  {100000000000000000.0f, 3, 2, 0.25f, 0.0f, 0.0f, 1, 1, true, 4321ULL},
+  {100000000000000000.0f, 3, 2, 0.25f, 0.2f, 0.05f, 0, 0, false, 4321ULL},
+  {100000000000000000.0f, 3, 2, 0.25f, 0.0f, 0.0f, 1, 1, false, 4321ULL}};
 
 const std::vector<RsvdInputs<double>> sanity_inputs_dx = {
-  {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, true, 4321ULL},
-  {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, true, 4321ULL},
-  {100000000000000000.0, 3, 2, 0.2, 0.05, 0, 0, false, 4321ULL},
-  {100000000000000000.0, 3, 2, 0.0, 0.0, 1, 1, false, 4321ULL}};
+  {100000000000000000.0, 3, 2, 0.25f, 0.2, 0.05, 0, 0, true, 4321ULL},
+  {100000000000000000.0, 3, 2, 0.25f, 0.0, 0.0, 1, 1, true, 4321ULL},
+  {100000000000000000.0, 3, 2, 0.25f, 0.2, 0.05, 0, 0, false, 4321ULL},
+  {100000000000000000.0, 3, 2, 0.25f, 0.0, 0.0, 1, 1, false, 4321ULL}};
 
 typedef RsvdTest<float> RsvdSanityCheckValF;
 TEST_P(RsvdSanityCheckValF, Result)
diff --git a/cpp/test/random/make_blobs.cu b/cpp/test/random/make_blobs.cu
index 8c7e440d0e..b1ce4b3236 100644
--- a/cpp/test/random/make_blobs.cu
+++ b/cpp/test/random/make_blobs.cu
@@ -14,14 +14,15 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "test_utils.h"
 #include <cub/cub.cuh>
 #include <gtest/gtest.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/random/make_blobs.hpp>
 
-namespace raft::random {
+namespace raft {
+namespace random {
 
 template <typename T>
 __global__ void meanKernel(T* out,
@@ -136,8 +137,8 @@ class MakeBlobsTest : public ::testing::TestWithParam<MakeBlobsInputs<T>> {
   {
     int len      = params.n_clusters * params.cols;
     auto compare = raft::CompareApprox<T>(num_sigma * params.tolerance);
-    ASSERT_TRUE(raft::devArrMatch(mu_vec.data(), mean_var.data(), len, compare, stream));
-    ASSERT_TRUE(raft::devArrMatch(params.std, mean_var.data() + len, len, compare, stream));
+    ASSERT_TRUE(raft::devArrMatch(mu_vec.data(), mean_var.data(), len, compare));
+    ASSERT_TRUE(raft::devArrMatch(params.std, mean_var.data() + len, len, compare));
   }
 
  protected:
@@ -153,53 +154,37 @@ typedef MakeBlobsTest<float> MakeBlobsTestF;
 const std::vector<MakeBlobsInputs<float>> inputsf_t = {
   {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 1024, 32, 3, 1.f, true, false, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.f, true, false, raft::random::GenPC, 1234ULL},
   {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 1024, 32, 3, 1.f, false, false, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.f, false, false, raft::random::GenPC, 1234ULL},
   {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 1024, 32, 3, 1.f, true, true, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.f, true, true, raft::random::GenPC, 1234ULL},
   {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenKiss99, 1234ULL},
-
+  {0.0055, 1024, 32, 3, 1.f, false, true, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.f, false, true, raft::random::GenPC, 1234ULL},
   {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenKiss99, 1234ULL},
+
+  {0.0055, 5003, 32, 5, 1.f, true, false, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.f, true, false, raft::random::GenPC, 1234ULL},
   {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 5003, 32, 5, 1.f, false, false, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.f, false, false, raft::random::GenPC, 1234ULL},
   {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 5003, 32, 5, 1.f, true, true, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.f, true, true, raft::random::GenPC, 1234ULL},
   {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 5003, 32, 5, 1.f, false, true, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.f, false, true, raft::random::GenPC, 1234ULL},
 };
 
 TEST_P(MakeBlobsTestF, Result) { check(); }
@@ -209,55 +194,40 @@ typedef MakeBlobsTest<double> MakeBlobsTestD;
 const std::vector<MakeBlobsInputs<double>> inputsd_t = {
   {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 1024, 32, 3, 1.0, true, false, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.0, true, false, raft::random::GenPC, 1234ULL},
   {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 1024, 32, 3, 1.0, false, false, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.0, false, false, raft::random::GenPC, 1234ULL},
   {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 1024, 32, 3, 1.0, true, true, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.0, true, true, raft::random::GenPC, 1234ULL},
   {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 1024, 32, 3, 1.0, false, true, raft::random::GenPC, 1234ULL},
+  {0.011, 1024, 8, 3, 1.0, false, true, raft::random::GenPC, 1234ULL},
 
   {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 5003, 32, 5, 1.0, true, false, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.0, true, false, raft::random::GenPC, 1234ULL},
   {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 5003, 32, 5, 1.0, false, false, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.0, false, false, raft::random::GenPC, 1234ULL},
   {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 5003, 32, 5, 1.0, true, true, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.0, true, true, raft::random::GenPC, 1234ULL},
   {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenPhilox, 1234ULL},
   {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenPhilox, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenTaps, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenTaps, 1234ULL},
-  {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenKiss99, 1234ULL},
-  {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenKiss99, 1234ULL},
+  {0.0055, 5003, 32, 5, 1.0, false, true, raft::random::GenPC, 1234ULL},
+  {0.011, 5003, 8, 5, 1.0, false, true, raft::random::GenPC, 1234ULL},
 };
 TEST_P(MakeBlobsTestD, Result) { check(); }
 INSTANTIATE_TEST_CASE_P(MakeBlobsTests, MakeBlobsTestD, ::testing::ValuesIn(inputsd_t));
 
-}  // end namespace raft::random
\ No newline at end of file
+}  // end namespace random
+}  // end namespace raft
\ No newline at end of file

From 02341d60149d1eb83da28a748e564b7da76a1603 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 11 Feb 2022 15:09:32 -0500
Subject: [PATCH 23/24] Fixing style

---
 cpp/test/random/make_blobs.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/test/random/make_blobs.cu b/cpp/test/random/make_blobs.cu
index b1ce4b3236..caad627d49 100644
--- a/cpp/test/random/make_blobs.cu
+++ b/cpp/test/random/make_blobs.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "test_utils.h"
+#include "../test_utils.h"
 #include <cub/cub.cuh>
 #include <gtest/gtest.h>
 #include <raft/cuda_utils.cuh>

From a3c59da08a8d01ca411aedcddfac26fe477783e5 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 11 Feb 2022 16:18:31 -0500
Subject: [PATCH 24/24] Fixing docs

---
 cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
index a25fd9295c..cd9163096a 100644
--- a/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
+++ b/cpp/include/raft/spatial/knn/epsilon_neighborhood.hpp
@@ -34,9 +34,11 @@ namespace knn {
  *                    matrix. Pass a nullptr if you don't need this info.
  * @param[in]  x      first matrix [row-major] [on device] [dim = m x k]
  * @param[in]  y      second matrix [row-major] [on device] [dim = n x k]
+ * @param[in]  m      number of rows in x
+ * @param[in]  n      number of rows in y
+ * @param[in]  k      number of columns in x and y
  * @param[in]  eps    defines epsilon neighborhood radius (should be passed as
  *                    squared as we compute L2-squared distance in this method)
- * @param[in]  fop    device lambda to do any other custom functions
  * @param[in]  stream cuda stream
  */
 template <typename DataT, typename IdxT>