From 478ddaca97a25e20f1f44f865693e507028f969b Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 21 Oct 2021 15:16:54 -0700
Subject: [PATCH 01/17] working through

---
 cpp/include/raft/linalg/add.cuh               | 19 +-----
 cpp/include/raft/linalg/binary_op.cuh         | 55 +++-------------
 cpp/include/raft/linalg/detail/add.cuh        | 47 ++++++++++++++
 cpp/include/raft/linalg/detail/binary_op.cuh  | 64 +++++++++++++++++++
 .../raft/linalg/detail/cholesky_r1_update.cuh |  0
 5 files changed, 123 insertions(+), 62 deletions(-)
 create mode 100644 cpp/include/raft/linalg/detail/add.cuh
 create mode 100644 cpp/include/raft/linalg/detail/binary_op.cuh
 create mode 100644 cpp/include/raft/linalg/detail/cholesky_r1_update.cuh
diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh
index 7a454f64e2..9aa1e6e82a 100644
--- a/cpp/include/raft/linalg/add.cuh
+++ b/cpp/include/raft/linalg/add.cuh
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include "detail/add.cuh"
+
 #include "binary_op.cuh"
 #include "unary_op.cuh"
 
@@ -63,16 +65,6 @@ void add(OutT *out, const InT *in1, const InT *in2, IdxType len,
   binaryOp<InT, decltype(op), OutT, IdxType>(out, in1, in2, len, op, stream);
 }
 
-template <class math_t, typename IdxType>
-__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
-                                      const math_t *singleScalarDev,
-                                      IdxType len) {
-  IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
-  if (i < len) {
-    outDev[i] = inDev[i] + *singleScalarDev;
-  }
-}
-
 /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i]
  * @tparam math_t data-type upon which the math operation will be performed
  * @tparam IdxType Integer type used to for addressing
@@ -86,12 +78,7 @@ template <typename math_t, typename IdxType = int>
 void addDevScalar(math_t *outDev, const math_t *inDev,
                   const math_t *singleScalarDev, IdxType len,
                   cudaStream_t stream) {
-  // TODO: block dimension has not been tuned
-  dim3 block(256);
-  dim3 grid(raft::ceildiv(len, (IdxType)block.x));
-  add_dev_scalar_kernel<math_t>
-    <<<grid, block, 0, stream>>>(outDev, inDev, singleScalarDev, len);
-  CUDA_CHECK(cudaPeekAtLastError());
+  detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh
index 940d786e87..eb2831b7e8 100644
--- a/cpp/include/raft/linalg/binary_op.cuh
+++ b/cpp/include/raft/linalg/binary_op.cuh
@@ -16,51 +16,14 @@
 
 #pragma once
 
+#include "detail/binary_op.cuh"
+
 #include <raft/cuda_utils.cuh>
 #include <raft/vectorized.cuh>
 
 namespace raft {
 namespace linalg {
 
-template <typename InType, int VecLen, typename Lambda, typename IdxType,
-          typename OutType>
-__global__ void binaryOpKernel(OutType *out, const InType *in1,
-                               const InType *in2, IdxType len, Lambda op) {
-  typedef TxN_t<InType, VecLen> InVecType;
-  typedef TxN_t<OutType, VecLen> OutVecType;
-  InVecType a, b;
-  OutVecType c;
-  IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x);
-  idx *= InVecType::Ratio;
-  if (idx >= len) return;
-  a.load(in1, idx);
-  b.load(in2, idx);
-#pragma unroll
-  for (int i = 0; i < InVecType::Ratio; ++i) {
-    c.val.data[i] = op(a.val.data[i], b.val.data[i]);
-  }
-  c.store(out, idx);
-}
-
-template <typename InType, int VecLen, typename Lambda, typename IdxType,
-          typename OutType, int TPB>
-void binaryOpImpl(OutType *out, const InType *in1, const InType *in2,
-                  IdxType len, Lambda op, cudaStream_t stream) {
-  const IdxType nblks =
-    raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
-  binaryOpKernel<InType, VecLen, Lambda, IdxType, OutType>
-    <<<nblks, TPB, 0, stream>>>(out, in1, in2, len, op);
-  CUDA_CHECK(cudaPeekAtLastError());
-}
-
-/**
- * @brief Checks if addresses are aligned on N bytes
- */
-inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3,
-                           uint64_t N) {
-  return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0;
-}
-
 /**
  * @brief perform element-wise binary operation on the input arrays
  * @tparam InType input data-type
@@ -88,26 +51,26 @@ void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len,
   uint64_t in2Addr = uint64_t(in2);
   uint64_t outAddr = uint64_t(out);
   if (16 / maxSize && bytes % 16 == 0 &&
-      addressAligned(in1Addr, in2Addr, outAddr, 16)) {
-    binaryOpImpl<InType, 16 / maxSize, Lambda, IdxType, OutType, TPB>(
+      detail::addressAligned(in1Addr, in2Addr, outAddr, 16)) {
+    detail::binaryOpImpl<InType, 16 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else if (8 / maxSize && bytes % 8 == 0 &&
-             addressAligned(in1Addr, in2Addr, outAddr, 8)) {
-    binaryOpImpl<InType, 8 / maxSize, Lambda, IdxType, OutType, TPB>(
+             detail::addressAligned(in1Addr, in2Addr, outAddr, 8)) {
+    detail::binaryOpImpl<InType, 8 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else if (4 / maxSize && bytes % 4 == 0 &&
-             addressAligned(in1Addr, in2Addr, outAddr, 4)) {
-    binaryOpImpl<InType, 4 / maxSize, Lambda, IdxType, OutType, TPB>(
+             detail::addressAligned(in1Addr, in2Addr, outAddr, 4)) {
+    detail::binaryOpImpl<InType, 4 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else if (2 / maxSize && bytes % 2 == 0 &&
-             addressAligned(in1Addr, in2Addr, outAddr, 2)) {
-    binaryOpImpl<InType, 2 / maxSize, Lambda, IdxType, OutType, TPB>(
+             detail::addressAligned(in1Addr, in2Addr, outAddr, 2)) {
+    detail::binaryOpImpl<InType, 2 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else if (1 / maxSize) {
-    binaryOpImpl<InType, 1 / maxSize, Lambda, IdxType, OutType, TPB>(
+    detail::binaryOpImpl<InType, 1 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else {
-    binaryOpImpl<InType, 1, Lambda, IdxType, OutType, TPB>(out, in1, in2, len,
+    detail::binaryOpImpl<InType, 1, Lambda, IdxType, OutType, TPB>(out, in1, in2, len,
                                                            op, stream);
   }
 }
diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh
new file mode 100644
index 0000000000..550f1bcde3
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/add.cuh
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+template <class math_t, typename IdxType>
+__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
+                                    const math_t *singleScalarDev,
+                                    IdxType len) {
+IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
+if (i < len) {
+    outDev[i] = inDev[i] + *singleScalarDev;
+}
+}
+
+template <typename math_t, typename IdxType = int>
+void addDevScalar(math_t *outDev, const math_t *inDev,
+                const math_t *singleScalarDev, IdxType len,
+                cudaStream_t stream) {
+// TODO: block dimension has not been tuned
+dim3 block(256);
+dim3 grid(raft::ceildiv(len, (IdxType)block.x));
+add_dev_scalar_kernel<math_t>
+    <<<grid, block, 0, stream>>>(outDev, inDev, singleScalarDev, len);
+CUDA_CHECK(cudaPeekAtLastError());
+}
+
+} // namespace detail
+} // namespace linalg
+} // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh
new file mode 100644
index 0000000000..866bedf1ba
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/binary_op.cuh
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #pragma once
+
+ namespace raft {
+ namespace linalg {
+ namespace detail {
+
+    template <typename InType, int VecLen, typename Lambda, typename IdxType,
+          typename OutType>
+__global__ void binaryOpKernel(OutType *out, const InType *in1,
+                               const InType *in2, IdxType len, Lambda op) {
+  typedef TxN_t<InType, VecLen> InVecType;
+  typedef TxN_t<OutType, VecLen> OutVecType;
+  InVecType a, b;
+  OutVecType c;
+  IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x);
+  idx *= InVecType::Ratio;
+  if (idx >= len) return;
+  a.load(in1, idx);
+  b.load(in2, idx);
+#pragma unroll
+  for (int i = 0; i < InVecType::Ratio; ++i) {
+    c.val.data[i] = op(a.val.data[i], b.val.data[i]);
+  }
+  c.store(out, idx);
+}
+
+template <typename InType, int VecLen, typename Lambda, typename IdxType,
+          typename OutType, int TPB>
+void binaryOpImpl(OutType *out, const InType *in1, const InType *in2,
+                  IdxType len, Lambda op, cudaStream_t stream) {
+  const IdxType nblks =
+    raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
+  binaryOpKernel<InType, VecLen, Lambda, IdxType, OutType>
+    <<<nblks, TPB, 0, stream>>>(out, in1, in2, len, op);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+/**
+ * @brief Checks if addresses are aligned on N bytes
+ */
+inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3,
+                           uint64_t N) {
+  return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0;
+}
+
+    } // namespace detail
+} // namespace linalg
+} // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh
new file mode 100644
index 0000000000..e69de29bb2

From d4b72ba275bd42b68a437d988b729c8d00432ce4 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Fri, 5 Nov 2021 15:41:27 -0700
Subject: [PATCH 02/17] working through

---
 cpp/include/raft/linalg/add.cuh               |   9 +-
 cpp/include/raft/linalg/binary_op.cuh         |   1 -
 .../raft/linalg/cholesky_r1_update.cuh        |  89 +----
 .../raft/linalg/coalesced_reduction.cuh       |  54 +---
 cpp/include/raft/linalg/contractions.cuh      | 278 +---------------
 cpp/include/raft/linalg/detail/binary_op.cuh  |   2 +
 .../raft/linalg/detail/cholesky_r1_update.cuh |   0
 .../raft/linalg/detail/cholesky_r1_update.hpp | 119 +++++++
 .../linalg/detail/coalesced_reduction.cuh     |  89 +++++
 .../raft/linalg/detail/contractions.cuh       | 303 ++++++++++++++++++
 cpp/include/raft/linalg/detail/eig.hpp        | 167 ++++++++++
 cpp/include/raft/linalg/detail/functional.cuh |  80 +++++
 cpp/include/raft/linalg/divide.cuh            |   5 +-
 cpp/include/raft/linalg/eig.cuh               | 107 +------
 cpp/include/raft/linalg/eltwise.cuh           |  27 +-
 15 files changed, 794 insertions(+), 536 deletions(-)
 delete mode 100644 cpp/include/raft/linalg/detail/cholesky_r1_update.cuh
 create mode 100644 cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
 create mode 100644 cpp/include/raft/linalg/detail/coalesced_reduction.cuh
 create mode 100644 cpp/include/raft/linalg/detail/contractions.cuh
 create mode 100644 cpp/include/raft/linalg/detail/eig.hpp
 create mode 100644 cpp/include/raft/linalg/detail/functional.cuh

diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh
index 9aa1e6e82a..de614185c0 100644
--- a/cpp/include/raft/linalg/add.cuh
+++ b/cpp/include/raft/linalg/add.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include "detail/add.cuh"
+#include "detail/functional.cuh"
 
 #include "binary_op.cuh"
 #include "unary_op.cuh"
@@ -24,6 +25,8 @@
 namespace raft {
 namespace linalg {
 
+using detail::adds_scalar;
+
 /**
  * @brief Elementwise scalar add operation on the input buffer
  *
@@ -41,8 +44,7 @@ namespace linalg {
 template <typename InT, typename OutT = InT, typename IdxType = int>
 void addScalar(OutT *out, const InT *in, InT scalar, IdxType len,
                cudaStream_t stream) {
-  auto op = [scalar] __device__(InT in) { return OutT(in + scalar); };
-  unaryOp<InT, decltype(op), IdxType, OutT>(out, in, len, op, stream);
+  unaryOp(out, in, len, adds_scalar<InT, OutT>(scalar), stream);
 }
 
 /**
@@ -61,8 +63,7 @@ void addScalar(OutT *out, const InT *in, InT scalar, IdxType len,
 template <typename InT, typename OutT = InT, typename IdxType = int>
 void add(OutT *out, const InT *in1, const InT *in2, IdxType len,
          cudaStream_t stream) {
-  auto op = [] __device__(InT a, InT b) { return OutT(a + b); };
-  binaryOp<InT, decltype(op), OutT, IdxType>(out, in1, in2, len, op, stream);
+  binaryOp(out, in1, in2, len, thrust::plus<InT>(), stream);
 }
 
 /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i]
diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh
index eb2831b7e8..56dc9cf623 100644
--- a/cpp/include/raft/linalg/binary_op.cuh
+++ b/cpp/include/raft/linalg/binary_op.cuh
@@ -19,7 +19,6 @@
 #include "detail/binary_op.cuh"
 
 #include <raft/cuda_utils.cuh>
-#include <raft/vectorized.cuh>
 
 namespace raft {
 namespace linalg {
diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh
index d6d064c20e..e72d3e963f 100644
--- a/cpp/include/raft/linalg/cholesky_r1_update.cuh
+++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh
@@ -16,11 +16,7 @@
 
 #pragma once
 
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/cusolver_wrappers.h>
-#include <raft/cuda_utils.cuh>
-#include <raft/handle.hpp>
-#include <raft/linalg/binary_op.cuh>
+#include "detail/cholesky_r1_update.hpp"
 
 namespace raft {
 namespace linalg {
@@ -125,88 +121,7 @@ template <typename math_t>
 void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
                          void *workspace, int *n_bytes, cublasFillMode_t uplo,
                          cudaStream_t stream, math_t eps = -1) {
-  // The matrix A' is defined as:
-  // A' = [[A_11, A_12]
-  //       [A_21, A_22]]
-  // where:
-  // - A_11 = A, matrix of size (n-1)x(n-1)
-  // - A_21[j] = A_12.T[j] = A_new[j] j=0..n-2, vector with (n-1) elements
-  // - A_22 = A_new[n-1] scalar.
-  //
-  // Instead of caclulating the Cholelsky decomposition of A' from scratch,
-  // we just update L with the new row. The new Cholesky decomposition will be
-  // calculated as:
-  // L' = [[L_11,    0]
-  //       [L_12, L_22]]
-  // where L_11 is the Cholesky decomposition of A (size [n-1 x n-1]), and
-  // L_12 and L_22 are the new quantities that we need to calculate.
-
-  // We need a workspace in device memory to store a scalar. Additionally, in
-  // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats.
-  const int align = 256;
-  int offset = (uplo == CUBLAS_FILL_MODE_LOWER)
-                 ? raft::alignTo<int>(sizeof(math_t) * (n - 1), align)
-                 : 0;
-  if (workspace == nullptr) {
-    *n_bytes = offset + 1 * sizeof(math_t);
-    return;
-  }
-  math_t *s = reinterpret_cast<math_t *>(((char *)workspace) + offset);
-  math_t *L_22 = L + (n - 1) * ld + n - 1;
-
-  math_t *A_new;
-  math_t *A_row;
-  if (uplo == CUBLAS_FILL_MODE_UPPER) {
-    // A_new is stored as the n-1 th column of L
-    A_new = L + (n - 1) * ld;
-  } else {
-    // If the input is lower triangular, then the new elements of A are stored
-    // as the n-th row of L. Since the matrix is column major, this is non
-    // contiguous. We copy elements from A_row to a contiguous workspace A_new.
-    A_row = L + n - 1;
-    A_new = reinterpret_cast<math_t *>(workspace);
-    CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1,
-                                          A_row, ld, A_new, 1, stream));
-  }
-  cublasOperation_t op =
-    (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N;
-  if (n > 1) {
-    // Calculate L_12 = x by solving equation L_11 x = A_12
-    math_t alpha = 1;
-    CUBLAS_CHECK(raft::linalg::cublastrsm(
-      handle.get_cublas_handle(), CUBLAS_SIDE_LEFT, uplo, op,
-      CUBLAS_DIAG_NON_UNIT, n - 1, 1, &alpha, L, ld, A_new, n - 1, stream));
-
-    // A_new now stores L_12, we calculate s = L_12 * L_12
-    CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1,
-                                         A_new, 1, A_new, 1, s, stream));
-
-    if (uplo == CUBLAS_FILL_MODE_LOWER) {
-      // Copy back the L_12 elements as the n-th row of L
-      CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1,
-                                            A_new, 1, A_row, ld, stream));
-    }
-  } else {  // n == 1 case
-    CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream));
-  }
-
-  // L_22 = sqrt(A_22 - L_12 * L_12)
-  math_t s_host;
-  math_t L_22_host;
-  raft::update_host(&s_host, s, 1, stream);
-  raft::update_host(&L_22_host, L_22, 1, stream);  // L_22 stores A_22
-  CUDA_CHECK(cudaStreamSynchronize(stream));
-  L_22_host = std::sqrt(L_22_host - s_host);
-
-  // Check for numeric error with sqrt. If the matrix is not positive definit or
-  // the system is very ill conditioned then the A_22 - L_12 * L_12 can be
-  // negative, which would result L_22 = NaN. A small positive eps parameter
-  // can be used to prevent this.
-  if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) {
-    L_22_host = eps;
-  }
-  ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update");
-  raft::update_device(L_22, &L_22_host, 1, stream);
+  detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, stream, eps);
 }
 };  // namespace linalg
 };  // namespace raft
diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh
index ef983ff3d0..d11489bd7e 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/coalesced_reduction.cuh
@@ -16,42 +16,11 @@
 
 #pragma once
 
-#include <cub/cub.cuh>
-#include <raft/cuda_utils.cuh>
+#include "detail/coalesced_reduction.cuh"
 
 namespace raft {
 namespace linalg {
 
-// Kernel (based on norm.cuh) to perform reductions along the coalesced dimension
-// of the matrix, i.e. reduce along rows for row major or reduce along columns
-// for column major layout. Kernel does an inplace reduction adding to original
-// values of dots.
-template <typename InType, typename OutType, typename IdxType, int TPB,
-          typename MainLambda, typename ReduceLambda, typename FinalLambda>
-__global__ void coalescedReductionKernel(OutType *dots, const InType *data,
-                                         int D, int N, OutType init,
-                                         MainLambda main_op,
-                                         ReduceLambda reduce_op,
-                                         FinalLambda final_op,
-                                         bool inplace = false) {
-  typedef cub::BlockReduce<OutType, TPB> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  OutType thread_data = init;
-  IdxType rowStart = blockIdx.x * D;
-  for (IdxType i = threadIdx.x; i < D; i += TPB) {
-    IdxType idx = rowStart + i;
-    thread_data = reduce_op(thread_data, main_op(data[idx], i));
-  }
-  OutType acc = BlockReduce(temp_storage).Reduce(thread_data, reduce_op);
-  if (threadIdx.x == 0) {
-    if (inplace) {
-      dots[blockIdx.x] = final_op(reduce_op(dots[blockIdx.x], acc));
-    } else {
-      dots[blockIdx.x] = final_op(acc);
-    }
-  }
-}
-
 /**
  * @brief Compute reduction of the input matrix along the leading dimension
  *
@@ -88,26 +57,7 @@ void coalescedReduction(OutType *dots, const InType *data, int D, int N,
                         MainLambda main_op = raft::Nop<InType, IdxType>(),
                         ReduceLambda reduce_op = raft::Sum<OutType>(),
                         FinalLambda final_op = raft::Nop<OutType>()) {
-  // One block per reduction
-  // Efficient only for large leading dimensions
-  if (D <= 32) {
-    coalescedReductionKernel<InType, OutType, IdxType, 32>
-      <<<N, 32, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
-                             final_op, inplace);
-  } else if (D <= 64) {
-    coalescedReductionKernel<InType, OutType, IdxType, 64>
-      <<<N, 64, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
-                             final_op, inplace);
-  } else if (D <= 128) {
-    coalescedReductionKernel<InType, OutType, IdxType, 128>
-      <<<N, 128, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
-                              final_op, inplace);
-  } else {
-    coalescedReductionKernel<InType, OutType, IdxType, 256>
-      <<<N, 256, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
-                              final_op, inplace);
-  }
-  CUDA_CHECK(cudaPeekAtLastError());
+  detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh
index e6ff8a49ce..107c8ac3c2 100644
--- a/cpp/include/raft/linalg/contractions.cuh
+++ b/cpp/include/raft/linalg/contractions.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/common/device_loads_stores.cuh>
+#include "detail/contractions.cuh"
 
 namespace raft {
 namespace linalg {
@@ -201,281 +201,7 @@ struct Policy2x8<double, _veclen> {
  * @tparam Policy policy used to customize memory access behavior.
  *                See documentation for `KernelPolicy` to know more.
  */
-template <typename DataT, typename IdxT, typename Policy,
-          bool isRowMajor = true>
-struct Contractions_NT {
- protected:
-  typedef Policy P;
-
-  /** number of rows in X */
-  IdxT m;
-  /** number of rows in Y */
-  IdxT n;
-  /** number of columns in X and Y */
-  IdxT k;
-  /** leading dimension in X */
-  IdxT lda;
-  /** leading dimension in Y */
-  IdxT ldb;
-  /** leading dimension in Output D */
-  IdxT ldd;
-
-  /** current thread's global mem row id for X data */
-  IdxT xrowid;
-  /** current thread's global mem row id for Y data */
-  IdxT yrowid;
-  /** global memory pointer to X matrix */
-  const DataT* x;
-  /** global memory pointer to Y matrix */
-  const DataT* y;
-
-  /** current thread's smem row id */
-  int srowid;
-  /** current thread's smem column id */
-  int scolid;
-  /** current thread's accumulation row id */
-  int accrowid;
-  /** current thread's accumulation column id */
-  int acccolid;
-
-  /** base smem pointer for X data storage */
-  DataT* sx;
-  /** base smem pointer for Y data storage */
-  DataT* sy;
-  /** index pointing the correct smem page for writing after `ldgXY()` */
-  int pageWr;
-  /** index pointing the correct smem page for reading during `ldsXY()` */
-  int pageRd;
-
-  /** block of X data loaded from smem after `ldsXY()` */
-  DataT regx[P::AccRowsPerTh][P::Veclen];
-  /** block of Y data loaded from smem after `ldsXY()` */
-  DataT regy[P::AccColsPerTh][P::Veclen];
-  /** block of X data loaded from global mem after `ldgXY()` */
-  DataT ldgDataX[P::LdgPerThX][P::Veclen];
-  /** block of Y data loaded from global mem after `ldgXY()` */
-  DataT ldgDataY[P::LdgPerThY][P::Veclen];
-
-  static const DataT Zero = (DataT)0;
-
- public:
-  /**
-   * @brief Ctor
-   * @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major]
-   * @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major]
-   * @param[in] _m number of rows of X
-   * @param[in] _n number of rows of Y
-   * @param[in] _k number of cols of X and Y
-   * @param[in] _smem shared memory region used during computations
-   */
-  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
-                     IdxT _k, char* _smem)
-    : m(_m),
-      n(_n),
-      k(_k),
-      lda(_k),
-      ldb(_k),
-      xrowid(IdxT(blockIdx.x) * P::Mblk + threadIdx.x / P::LdgThRow),
-      yrowid(IdxT(blockIdx.y) * P::Nblk + threadIdx.x / P::LdgThRow),
-      x(_x + xrowid * lda),
-      y(_y + yrowid * ldb),
-      srowid(threadIdx.x / P::LdgThRow),
-      scolid((threadIdx.x % P::LdgThRow) * P::Veclen),
-      accrowid(threadIdx.x / P::AccThCols),
-      acccolid(threadIdx.x % P::AccThCols),
-      sx((DataT*)_smem),
-      sy(&(sx[P::SmemPageX])),
-      pageWr(0),
-      pageRd(0) {}
-
-  /**
-   * @brief Ctor
-   * @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major]
-   * @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major]
-   * @param[in] _m number of rows of X
-   * @param[in] _n number of rows of Y
-   * @param[in] _k number of cols of X and Y
-   * @param[in] _smem shared memory region used during computations
-   */
-  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
-                     IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem)
-    : m(_m),
-      n(_n),
-      k(_k),
-      lda(_lda),
-      ldb(_ldb),
-      ldd(_ldd),
-      srowid(threadIdx.x / P::LdgThRow),
-      scolid((threadIdx.x % P::LdgThRow) * P::Veclen),
-      accrowid(threadIdx.x / P::AccThCols),
-      acccolid(threadIdx.x % P::AccThCols),
-      sx((DataT*)_smem),
-      sy(&(sx[P::SmemPageX])),
-      pageWr(0),
-      pageRd(0) {
-    if (isRowMajor) {
-      xrowid = IdxT(blockIdx.y) * P::Mblk + srowid;
-      yrowid = IdxT(blockIdx.x) * P::Nblk + srowid;
-      x = _x + xrowid * lda;
-      y = _y + yrowid * ldb;
-    } else {
-      xrowid = IdxT(blockIdx.y) * P::Mblk;
-      yrowid = IdxT(blockIdx.x) * P::Nblk;
-      x = _x + xrowid + srowid * lda;
-      y = _y + yrowid + srowid * ldb;
-    }
-  }
-
- protected:
-  /**
-   * @brief Load current block of X/Y from global memory to registers
-   * @param[in] kidx current start index of k to be loaded
-   */
-  DI void ldgXY(IdxT kidx) {
-    ldgX(kidx);
-    ldgY(kidx);
-  }
-
-  /**
-   * @brief Store current block of X/Y from registers to smem
-   * @param[in] kidx current start index of k to be loaded
-   */
-  DI void stsXY() {
-    stsX(sx + pageWr * P::SmemPage);
-    stsY(sy + pageWr * P::SmemPage);
-  }
-
-  /**
-   * @brief Load X and Y block from shared memory to registers
-   * @param[in] kidx k value from the current k-block to be loaded from smem
-   */
-  DI void ldsXY(int kidx) {
-    ldsX(kidx, sx + pageRd * P::SmemPage);
-    ldsY(kidx, sy + pageRd * P::SmemPage);
-  }
-
- private:
-  DI void ldgX(IdxT kidx) {
-    if (isRowMajor) {
-      auto numRows = m;
-      auto koffset = kidx + scolid;
-#pragma unroll
-      for (int i = 0; i < P::LdgPerThX; ++i) {
-        if (koffset < lda && (xrowid + i * P::LdgRowsX) < numRows) {
-          ldg(ldgDataX[i], x + i * P::LdgRowsX * lda + koffset);
-        } else {
-#pragma unroll
-          for (int j = 0; j < P::Veclen; ++j) {
-            ldgDataX[i][j] = Zero;
-          }
-        }
-      }
-    } else {
-      const auto numRows = k;
-      auto koffset = scolid;
-#pragma unroll
-      for (int i = 0; i < P::LdgPerThX; ++i) {
-        if ((koffset + xrowid) < lda &&
-            (srowid + kidx + i * P::LdgRowsX) < numRows) {
-          ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset);
-        } else {
-#pragma unroll
-          for (int j = 0; j < P::Veclen; ++j) {
-            ldgDataX[i][j] = Zero;
-          }
-        }
-      }
-    }
-  }
-
-  DI void ldgY(IdxT kidx) {
-    if (isRowMajor) {
-      auto numRows = n;
-      auto koffset = kidx + scolid;
-#pragma unroll
-      for (int i = 0; i < P::LdgPerThY; ++i) {
-        if (koffset < ldb && (yrowid + i * P::LdgRowsY) < numRows) {
-          ldg(ldgDataY[i], y + i * P::LdgRowsY * ldb + koffset);
-        } else {
-#pragma unroll
-          for (int j = 0; j < P::Veclen; ++j) {
-            ldgDataY[i][j] = Zero;
-          }
-        }
-      }
-    } else {
-      auto numRows = k;
-      auto koffset = scolid;
-#pragma unroll
-      for (int i = 0; i < P::LdgPerThY; ++i) {
-        if ((koffset + yrowid) < ldb &&
-            (srowid + kidx + i * P::LdgRowsY) < numRows) {
-          ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset);
-        } else {
-#pragma unroll
-          for (int j = 0; j < P::Veclen; ++j) {
-            ldgDataY[i][j] = Zero;
-          }
-        }
-      }
-    }
-  }
-
-  DI void stsX(DataT* smem) {
-    auto* saddr = smem + srowid * P::SmemStride + scolid;
-#pragma unroll
-    for (int i = 0; i < P::LdgPerThX; ++i) {
-      sts(saddr + i * P::LdgRowsX * P::SmemStride, ldgDataX[i]);
-    }
-  }
-
-  DI void stsY(DataT* smem) {
-    auto* saddr = smem + srowid * P::SmemStride + scolid;
-#pragma unroll
-    for (int i = 0; i < P::LdgPerThY; ++i) {
-      sts(saddr + i * P::LdgRowsY * P::SmemStride, ldgDataY[i]);
-    }
-  }
-
-  DI void ldsX(int kidx, DataT* smem) {
-    if (isRowMajor) {
-      auto* saddr = smem + accrowid * P::SmemStride + kidx;
-#pragma unroll
-      for (int i = 0; i < P::AccRowsPerTh; ++i) {
-        lds(regx[i], saddr + i * P::AccThRows * P::SmemStride);
-      }
-    } else {
-      auto* saddr = smem + accrowid + kidx * P::SmemStride;
-#pragma unroll
-      for (int i = 0; i < P::AccRowsPerTh; ++i) {
-#pragma unroll
-        for (int v = 0; v < P::Veclen; ++v) {
-          regx[i][v] = saddr[i * P::AccThRows + v * P::SmemStride];
-        }
-      }
-    }
-  }
-
-  DI void ldsY(int kidx, DataT* smem) {
-    if (isRowMajor) {
-      auto* saddr = smem + acccolid * P::SmemStride + kidx;
-#pragma unroll
-      for (int i = 0; i < P::AccColsPerTh; ++i) {
-        lds(regy[i], saddr + i * P::AccThCols * P::SmemStride);
-      }
-    } else {
-      auto* saddr = smem + acccolid + kidx * P::SmemStride;
-#pragma unroll
-      for (int i = 0; i < P::AccColsPerTh; ++i) {
-#pragma unroll
-        for (int v = 0; v < P::Veclen; ++v) {
-          regy[i][v] = saddr[i * P::AccThCols + v * P::SmemStride];
-        }
-      }
-    }
-  }
-
-};  // struct Contractions_NT
+using detail::Contractions_NT;
 
 }  // namespace linalg
 }  // namespace raft
diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh
index 866bedf1ba..89876afe46 100644
--- a/cpp/include/raft/linalg/detail/binary_op.cuh
+++ b/cpp/include/raft/linalg/detail/binary_op.cuh
@@ -16,6 +16,8 @@
 
  #pragma once
 
+ #include <raft/vectorized.cuh>
+
  namespace raft {
  namespace linalg {
  namespace detail {
diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
new file mode 100644
index 0000000000..b5f81579a6
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/linalg/cublas_wrappers.h>
+#include <raft/linalg/cusolver_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/handle.hpp>
+#include <raft/linalg/binary_op.cuh>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+template <typename math_t>
+void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
+                        void *workspace, int *n_bytes, cublasFillMode_t uplo,
+                        cudaStream_t stream, math_t eps = -1) {
+  // The matrix A' is defined as:
+  // A' = [[A_11, A_12]
+  //       [A_21, A_22]]
+  // where:
+  // - A_11 = A, matrix of size (n-1)x(n-1)
+  // - A_21[j] = A_12.T[j] = A_new[j] j=0..n-2, vector with (n-1) elements
+  // - A_22 = A_new[n-1] scalar.
+  //
+  // Instead of calculating the Cholesky decomposition of A' from scratch,
+  // we just update L with the new row. The new Cholesky decomposition will be
+  // calculated as:
+  // L' = [[L_11,    0]
+  //       [L_12, L_22]]
+  // where L_11 is the Cholesky decomposition of A (size [n-1 x n-1]), and
+  // L_12 and L_22 are the new quantities that we need to calculate.
+
+  // The device workspace holds one math_t scalar, plus n-1 math_t values in
+  // CUBLAS_FILL_MODE_LOWER. With workspace == nullptr only *n_bytes is set.
+  const int align = 256;
+  int offset = (uplo == CUBLAS_FILL_MODE_LOWER)
+                ? raft::alignTo<int>(sizeof(math_t) * (n - 1), align)
+                : 0;
+  if (workspace == nullptr) {
+    *n_bytes = offset + 1 * sizeof(math_t);
+    return;
+  }
+  math_t *s = reinterpret_cast<math_t *>(((char *)workspace) + offset);
+  math_t *L_22 = L + (n - 1) * ld + n - 1;
+
+  math_t *A_new;
+  math_t *A_row;
+  if (uplo == CUBLAS_FILL_MODE_UPPER) {
+    // A_new is stored as the (n-1)-th column of L (already contiguous)
+    A_new = L + (n - 1) * ld;
+  } else {
+    // If the input is lower triangular, then the new elements of A are stored
+    // as the n-th row of L. Since the matrix is column major, this is
+    // non-contiguous. We copy the elements of A_row to the workspace as A_new.
+    A_row = L + n - 1;
+    A_new = reinterpret_cast<math_t *>(workspace);
+    CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1,
+                                          A_row, ld, A_new, 1, stream));
+  }
+  cublasOperation_t op =
+    (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N;
+  if (n > 1) {
+    // Calculate L_12 = x by solving equation L_11 x = A_12
+    math_t alpha = 1;
+    CUBLAS_CHECK(raft::linalg::cublastrsm(
+      handle.get_cublas_handle(), CUBLAS_SIDE_LEFT, uplo, op,
+      CUBLAS_DIAG_NON_UNIT, n - 1, 1, &alpha, L, ld, A_new, n - 1, stream));
+
+    // A_new now stores L_12, we calculate s = L_12 * L_12 (dot product)
+    CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1,
+                                        A_new, 1, A_new, 1, s, stream));
+
+    if (uplo == CUBLAS_FILL_MODE_LOWER) {
+      // Copy back the L_12 elements as the n-th row of L
+      CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1,
+                                            A_new, 1, A_row, ld, stream));
+    }
+  } else {  // n == 1 case: there is no L_12, so s = 0
+    CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream));
+  }
+
+  // L_22 = sqrt(A_22 - L_12 * L_12), evaluated on the host after a stream sync
+  math_t s_host;
+  math_t L_22_host;
+  raft::update_host(&s_host, s, 1, stream);
+  raft::update_host(&L_22_host, L_22, 1, stream);  // L_22 stores A_22
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  L_22_host = std::sqrt(L_22_host - s_host);
+
+  // Check for numeric error with sqrt. If the matrix is not positive definite
+  // or the system is very ill conditioned then A_22 - L_12 * L_12 can be
+  // negative, which would result in L_22 = NaN. A small positive eps parameter
+  // can be used to prevent this.
+  if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) {
+    L_22_host = eps;
+  }
+  ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update");
+  raft::update_device(L_22, &L_22_host, 1, stream);
+}
+
+} // namespace detail
+} // namespace linalg
+} // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
new file mode 100644
index 0000000000..f9c5223bdc
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #pragma once
+
+#include <cub/cub.cuh>
+#include <raft/cuda_utils.cuh>
+
+ namespace raft {
+ namespace linalg {
+ namespace detail {
+
+// Reduces each length-D line of `data` along the coalesced dimension (rows for
+// row major, columns for column major), one block per line; the result goes to
+// dots[blockIdx.x]. Expects blockDim.x == TPB. When `inplace` is set, the old
+// dots[blockIdx.x] is folded in with reduce_op before final_op is applied.
+template <typename InType, typename OutType, typename IdxType, int TPB,
+          typename MainLambda, typename ReduceLambda, typename FinalLambda>
+__global__ void coalescedReductionKernel(OutType *dots, const InType *data,
+                                         int D, int N, OutType init,
+                                         MainLambda main_op,
+                                         ReduceLambda reduce_op,
+                                         FinalLambda final_op,
+                                         bool inplace = false) {
+  typedef cub::BlockReduce<OutType, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  OutType thread_data = init;
+  IdxType rowStart = IdxType(blockIdx.x) * IdxType(D);  // widen, then multiply
+  for (IdxType i = threadIdx.x; i < D; i += TPB) {
+    IdxType idx = rowStart + i;
+    thread_data = reduce_op(thread_data, main_op(data[idx], i));
+  }
+  OutType acc = BlockReduce(temp_storage).Reduce(thread_data, reduce_op);
+  if (threadIdx.x == 0) {
+    if (inplace) {
+      dots[blockIdx.x] = final_op(reduce_op(dots[blockIdx.x], acc));
+    } else {
+      dots[blockIdx.x] = final_op(acc);
+    }
+  }
+}
+
+template <typename InType, typename OutType = InType, typename IdxType = int,
+          typename MainLambda = raft::Nop<InType, IdxType>,
+          typename ReduceLambda = raft::Sum<OutType>,
+          typename FinalLambda = raft::Nop<OutType>>
+void coalescedReduction(OutType *dots, const InType *data, int D, int N,
+                        OutType init, cudaStream_t stream, bool inplace = false,
+                        MainLambda main_op = raft::Nop<InType, IdxType>(),
+                        ReduceLambda reduce_op = raft::Sum<OutType>(),
+                        FinalLambda final_op = raft::Nop<OutType>()) {
+  // One block per reduction (grid = N); block size is picked from D below.
+  // Efficient only for large leading dimensions
+  if (D <= 32) {
+    coalescedReductionKernel<InType, OutType, IdxType, 32>
+      <<<N, 32, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
+                             final_op, inplace);
+  } else if (D <= 64) {
+    coalescedReductionKernel<InType, OutType, IdxType, 64>
+      <<<N, 64, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
+                             final_op, inplace);
+  } else if (D <= 128) {
+    coalescedReductionKernel<InType, OutType, IdxType, 128>
+      <<<N, 128, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
+                              final_op, inplace);
+  } else {
+    coalescedReductionKernel<InType, OutType, IdxType, 256>
+      <<<N, 256, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
+                              final_op, inplace);
+  }
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+    } // namespace detail
+} // namespace linalg
+} // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh
new file mode 100644
index 0000000000..2e4657ebc3
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/contractions.cuh
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #pragma once
+
+ #include <raft/common/device_loads_stores.cuh>
+
+ namespace raft {
+ namespace linalg {
+ namespace detail {
+
+    template <typename DataT, typename IdxT, typename Policy,
+    bool isRowMajor = true>
+struct Contractions_NT {
+protected:
+typedef Policy P;
+
+/** number of rows in X */
+IdxT m;
+/** number of rows in Y */
+IdxT n;
+/** number of columns in X and Y */
+IdxT k;
+/** leading dimension in X */
+IdxT lda;
+/** leading dimension in Y */
+IdxT ldb;
+/** leading dimension in Output D (the ctor without ld arguments leaves it unset) */
+IdxT ldd;
+
+/** current thread's global mem row id for X data */
+IdxT xrowid;
+/** current thread's global mem row id for Y data */
+IdxT yrowid;
+/** global memory pointer to X matrix */
+const DataT* x;
+/** global memory pointer to Y matrix */
+const DataT* y;
+
+/** current thread's smem row id */
+int srowid;
+/** current thread's smem column id */
+int scolid;
+/** current thread's accumulation row id */
+int accrowid;
+/** current thread's accumulation column id */
+int acccolid;
+
+/** base smem pointer for X data storage */
+DataT* sx;
+/** base smem pointer for Y data storage */
+DataT* sy;
+/** index pointing the correct smem page for writing after `ldgXY()` */
+int pageWr;
+/** index pointing the correct smem page for reading during `ldsXY()` */
+int pageRd;
+
+/** block of X data loaded from smem after `ldsXY()` */
+DataT regx[P::AccRowsPerTh][P::Veclen];
+/** block of Y data loaded from smem after `ldsXY()` */
+DataT regy[P::AccColsPerTh][P::Veclen];
+/** block of X data loaded from global mem after `ldgXY()` */
+DataT ldgDataX[P::LdgPerThX][P::Veclen];
+/** block of Y data loaded from global mem after `ldgXY()` */
+DataT ldgDataY[P::LdgPerThY][P::Veclen];
+
+static const DataT Zero = (DataT)0;
+
+public:
+/**
+* @brief Ctor for tightly packed inputs: lda and ldb are both set to _k
+* @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major]
+* @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major]
+* @param[in] _m number of rows of X
+* @param[in] _n number of rows of Y
+* @param[in] _k number of cols of X and Y
+* @param[in] _smem shared memory region used during computations
+*/
+DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
+               IdxT _k, char* _smem)
+: m(_m),
+n(_n),
+k(_k),
+lda(_k),
+ldb(_k),
+xrowid(IdxT(blockIdx.x) * P::Mblk + threadIdx.x / P::LdgThRow),
+yrowid(IdxT(blockIdx.y) * P::Nblk + threadIdx.x / P::LdgThRow),
+x(_x + xrowid * lda),
+y(_y + yrowid * ldb),
+srowid(threadIdx.x / P::LdgThRow),
+scolid((threadIdx.x % P::LdgThRow) * P::Veclen),
+accrowid(threadIdx.x / P::AccThCols),
+acccolid(threadIdx.x % P::AccThCols),
+sx((DataT*)_smem),
+sy(&(sx[P::SmemPageX])),
+pageWr(0),
+pageRd(0) {}
+
+/**
+* @brief Ctor with explicit leading dimensions (layout chosen by isRowMajor)
+* @param[in] _x X matrix. [on device] [dim = _m x _k]
+* @param[in] _y Y matrix. [on device] [dim = _n x _k]
+* @param[in] _m,_n number of rows of X and of Y
+* @param[in] _k number of cols of X and Y
+* @param[in] _lda,_ldb,_ldd leading dimensions of X, Y and output D
+* @param[in] _smem shared memory region used during computations
+*/
+DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
+               IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem)
+: m(_m),
+n(_n),
+k(_k),
+lda(_lda),
+ldb(_ldb),
+ldd(_ldd),
+srowid(threadIdx.x / P::LdgThRow),
+scolid((threadIdx.x % P::LdgThRow) * P::Veclen),
+accrowid(threadIdx.x / P::AccThCols),
+acccolid(threadIdx.x % P::AccThCols),
+sx((DataT*)_smem),
+sy(&(sx[P::SmemPageX])),
+pageWr(0),
+pageRd(0) {
+if (isRowMajor) {
+xrowid = IdxT(blockIdx.y) * P::Mblk + srowid;
+yrowid = IdxT(blockIdx.x) * P::Nblk + srowid;
+x = _x + xrowid * lda;
+y = _y + yrowid * ldb;
+} else {
+xrowid = IdxT(blockIdx.y) * P::Mblk;
+yrowid = IdxT(blockIdx.x) * P::Nblk;
+x = _x + xrowid + srowid * lda;
+y = _y + yrowid + srowid * ldb;
+}
+}
+
+protected:
+/**
+* @brief Load current block of X/Y from global memory to registers
+* @param[in] kidx current start index of k to be loaded
+*/
+DI void ldgXY(IdxT kidx) {
+ldgX(kidx);
+ldgY(kidx);
+}
+
+/**
+* @brief Store current block of X/Y from registers to smem
+* @note takes no arguments; writes to the smem page selected by `pageWr`
+*/
+DI void stsXY() {
+stsX(sx + pageWr * P::SmemPage);
+stsY(sy + pageWr * P::SmemPage);
+}
+
+/**
+* @brief Load X and Y block from shared memory to registers
+* @param[in] kidx k value from the current k-block to be loaded from smem
+*/
+DI void ldsXY(int kidx) {
+ldsX(kidx, sx + pageRd * P::SmemPage);
+ldsY(kidx, sy + pageRd * P::SmemPage);
+}
+
+private:
+DI void ldgX(IdxT kidx) {
+if (isRowMajor) {
+auto numRows = m;
+auto koffset = kidx + scolid;
+#pragma unroll
+for (int i = 0; i < P::LdgPerThX; ++i) {
+  if (koffset < lda && (xrowid + i * P::LdgRowsX) < numRows) {
+    ldg(ldgDataX[i], x + i * P::LdgRowsX * lda + koffset);
+  } else {
+#pragma unroll
+    for (int j = 0; j < P::Veclen; ++j) {
+      ldgDataX[i][j] = Zero;
+    }
+  }
+}
+} else {
+const auto numRows = k;
+auto koffset = scolid;
+#pragma unroll
+for (int i = 0; i < P::LdgPerThX; ++i) {
+  if ((koffset + xrowid) < lda &&
+      (srowid + kidx + i * P::LdgRowsX) < numRows) {
+    ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset);
+  } else {
+#pragma unroll
+    for (int j = 0; j < P::Veclen; ++j) {
+      ldgDataX[i][j] = Zero;
+    }
+  }
+}
+}
+}
+
+DI void ldgY(IdxT kidx) {
+if (isRowMajor) {
+auto numRows = n;
+auto koffset = kidx + scolid;
+#pragma unroll
+for (int i = 0; i < P::LdgPerThY; ++i) {
+  if (koffset < ldb && (yrowid + i * P::LdgRowsY) < numRows) {
+    ldg(ldgDataY[i], y + i * P::LdgRowsY * ldb + koffset);
+  } else {
+#pragma unroll
+    for (int j = 0; j < P::Veclen; ++j) {
+      ldgDataY[i][j] = Zero;
+    }
+  }
+}
+} else {
+auto numRows = k;
+auto koffset = scolid;
+#pragma unroll
+for (int i = 0; i < P::LdgPerThY; ++i) {
+  if ((koffset + yrowid) < ldb &&
+      (srowid + kidx + i * P::LdgRowsY) < numRows) {
+    ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset);
+  } else {
+#pragma unroll
+    for (int j = 0; j < P::Veclen; ++j) {
+      ldgDataY[i][j] = Zero;
+    }
+  }
+}
+}
+}
+
+DI void stsX(DataT* smem) {
+auto* saddr = smem + srowid * P::SmemStride + scolid;
+#pragma unroll
+for (int i = 0; i < P::LdgPerThX; ++i) {
+sts(saddr + i * P::LdgRowsX * P::SmemStride, ldgDataX[i]);
+}
+}
+
+DI void stsY(DataT* smem) {
+auto* saddr = smem + srowid * P::SmemStride + scolid;
+#pragma unroll
+for (int i = 0; i < P::LdgPerThY; ++i) {
+sts(saddr + i * P::LdgRowsY * P::SmemStride, ldgDataY[i]);
+}
+}
+
+DI void ldsX(int kidx, DataT* smem) {
+if (isRowMajor) {
+auto* saddr = smem + accrowid * P::SmemStride + kidx;
+#pragma unroll
+for (int i = 0; i < P::AccRowsPerTh; ++i) {
+  lds(regx[i], saddr + i * P::AccThRows * P::SmemStride);
+}
+} else {
+auto* saddr = smem + accrowid + kidx * P::SmemStride;
+#pragma unroll
+for (int i = 0; i < P::AccRowsPerTh; ++i) {
+#pragma unroll
+  for (int v = 0; v < P::Veclen; ++v) {
+    regx[i][v] = saddr[i * P::AccThRows + v * P::SmemStride];
+  }
+}
+}
+}
+
+DI void ldsY(int kidx, DataT* smem) {
+if (isRowMajor) {
+auto* saddr = smem + acccolid * P::SmemStride + kidx;
+#pragma unroll
+for (int i = 0; i < P::AccColsPerTh; ++i) {
+  lds(regy[i], saddr + i * P::AccThCols * P::SmemStride);
+}
+} else {
+auto* saddr = smem + acccolid + kidx * P::SmemStride;
+#pragma unroll
+for (int i = 0; i < P::AccColsPerTh; ++i) {
+#pragma unroll
+  for (int v = 0; v < P::Veclen; ++v) {
+    regy[i][v] = saddr[i * P::AccThCols + v * P::SmemStride];
+  }
+}
+}
+}
+
+};  // struct Contractions_NT
+
+} // namespace detail
+} // namespace linalg
+} // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp
new file mode 100644
index 0000000000..71cea65c3c
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/eig.hpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda_runtime_api.h>
+#include <raft/cudart_utils.h>
+#include <raft/linalg/cusolver_wrappers.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/handle.hpp>
+#include <raft/matrix/matrix.cuh>
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+template <typename math_t>
+void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows,
+           int n_cols, math_t *eig_vectors, math_t *eig_vals,
+           cudaStream_t stream) {
+  // Symmetric eigendecomposition through cuSOLVER syevd. `in` is first copied
+  // into `eig_vectors`, and the solver is then run on that copy.
+  auto solver = handle.get_cusolver_dn_handle();
+
+  // Ask the solver how much scratch space it needs, then allocate it.
+  int scratch_len;
+  CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(
+    solver, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows, in,
+    n_cols, eig_vals, &scratch_len));
+  rmm::device_uvector<math_t> scratch(scratch_len, stream);
+  rmm::device_scalar<int> solver_info(stream);
+
+  raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
+  CUSOLVER_CHECK(cusolverDnsyevd(
+    solver, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows,
+    eig_vectors, n_cols, eig_vals, scratch.data(), scratch_len,
+    solver_info.data(), stream));
+  CUDA_CHECK(cudaGetLastError());
+
+  ASSERT(solver_info.value(stream) == 0,
+         "eig.cuh: eigensolver couldn't converge to a solution. "
+         "This usually occurs when some of the features do not vary enough.");
+}
+
+enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT };  // NOTE(review): eig.cuh keeps its own EigVecMemUsage enum; distinct enum types do not convert implicitly - verify the eigSelDC wrapper call
+
+#if CUDART_VERSION >= 10010
+
+/**
+ * @defgroup eig decomp with divide and conquer method for the column-major
+ * symmetric matrices
+ * @param handle raft handle
+ * @param in the input buffer (symmetric matrix); overwritten by OVERWRITE_INPUT
+ * @param n_rows: number of rows of the input
+ * @param n_cols: number of cols of the input
+ * @param n_eig_vals: number of eigenvectors to be generated
+ * @param eig_vectors: eigenvectors
+ * @param eig_vals: eigen values
+ * @param memUsage: OVERWRITE_INPUT solves inside `in`; COPY_INPUT on a copy
+ * @param stream cuda stream
+ * @{
+ */
+template <typename math_t>
+void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
+              int n_eig_vals, math_t *eig_vectors, math_t *eig_vals,
+              EigVecMemUsage memUsage, cudaStream_t stream) {
+  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+
+  int lwork;
+  int h_meig;
+
+  CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize(
+    cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
+    CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0),
+    n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork));
+
+  rmm::device_uvector<math_t> d_work(lwork, stream);
+  rmm::device_scalar<int> d_dev_info(stream);
+  rmm::device_uvector<math_t> d_eig_vectors(0, stream);
+
+  if (memUsage == OVERWRITE_INPUT) {
+    CUSOLVER_CHECK(cusolverDnsyevdx(
+      cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
+      CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0),
+      n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, d_work.data(), lwork,
+      d_dev_info.data(), stream));
+  } else if (memUsage == COPY_INPUT) {
+    d_eig_vectors.resize(n_rows * n_cols, stream);
+    raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream);
+    // Solve on the scratch copy (not on the output buffer) so `in` is intact.
+    CUSOLVER_CHECK(cusolverDnsyevdx(
+      cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
+      CUBLAS_FILL_MODE_UPPER, n_rows, d_eig_vectors.data(), n_cols,
+      math_t(0.0), math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig,
+      eig_vals, d_work.data(), lwork, d_dev_info.data(), stream));
+  }
+
+  CUDA_CHECK(cudaGetLastError());
+
+  int dev_info = d_dev_info.value(stream);
+  ASSERT(dev_info == 0,
+         "eig.cuh: eigensolver couldn't converge to a solution. "
+         "This usually occurs when some of the features do not vary enough.");
+
+  if (memUsage == OVERWRITE_INPUT) {
+    raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals,
+                                  stream);
+  } else if (memUsage == COPY_INPUT) {
+    raft::matrix::truncZeroOrigin(d_eig_vectors.data(), n_rows, eig_vectors,
+                                  n_rows, n_eig_vals, stream);
+  }
+}
+
+#endif
+
+template <typename math_t>
+void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows,
+               int n_cols, math_t *eig_vectors, math_t *eig_vals,
+               cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) {
+  // Symmetric eigensolver through cuSOLVER syevj, configured with `tol` and
+  // `sweeps`. `in` is copied into `eig_vectors` and the solver runs on the copy.
+  auto solver = handle.get_cusolver_dn_handle();
+
+  syevjInfo_t jacobi_cfg = nullptr;
+  CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&jacobi_cfg));
+  CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(jacobi_cfg, tol));
+  CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(jacobi_cfg, sweeps));
+
+  int scratch_len;
+  CUSOLVER_CHECK(cusolverDnsyevj_bufferSize(
+    solver, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows,
+    eig_vectors, n_cols, eig_vals, &scratch_len, jacobi_cfg));
+  rmm::device_uvector<math_t> scratch(scratch_len, stream);
+  rmm::device_scalar<int> solver_info(stream);
+
+  raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
+  CUSOLVER_CHECK(cusolverDnsyevj(
+    solver, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows,
+    eig_vectors, n_cols, eig_vals, scratch.data(), scratch_len,
+    solver_info.data(), jacobi_cfg, stream));
+
+  // The sweep count is queried (and the call checked) but not used afterwards.
+  int sweeps_done;
+  CUSOLVER_CHECK(
+    cusolverDnXsyevjGetSweeps(solver, jacobi_cfg, &sweeps_done));
+
+  CUDA_CHECK(cudaGetLastError());
+  CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(jacobi_cfg));
+}
+
+} // namespace detail
+} // namespace linalg
+} // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/functional.cuh b/cpp/include/raft/linalg/detail/functional.cuh
new file mode 100644
index 0000000000..275e5f5917
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/functional.cuh
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #pragma once
+
+ #include <thrust/functional.h>
+
+ namespace raft {
+ namespace linalg {
+ namespace detail {
+
+template <typename ArgType, typename ReturnType = ArgType>
+struct divides_scalar {
+    // Unary functor: operator() returns `in / scalar` for the stored scalar.
+public:
+    __host__ __device__ divides_scalar(ArgType scalar) : scalar_(scalar) {}
+
+    __host__ __device__ inline ReturnType operator()(ArgType in) const {
+        return in / scalar_;
+    }
+
+private:
+    ArgType scalar_;
+};
+
+template <typename ArgType, typename ReturnType = ArgType>
+struct adds_scalar {
+    // Unary functor: operator() returns `in + scalar` for the stored scalar.
+public:
+    __host__ __device__ adds_scalar(ArgType scalar) : scalar_(scalar) {}
+
+    __host__ __device__ inline ReturnType operator()(ArgType in) const {
+        return in + scalar_;
+    }
+
+private:
+    ArgType scalar_;
+};
+
+template <typename ArgType, typename ReturnType = ArgType>
+struct multiplies_scalar {
+    // Unary functor: operator() returns `in * scalar` for the stored scalar.
+public:
+    __host__ __device__ multiplies_scalar(ArgType scalar) : scalar_(scalar) {}
+
+    __host__ __device__ inline ReturnType operator()(ArgType in) const {
+        return in * scalar_;
+    }
+
+private:
+    ArgType scalar_;
+};
+
+template <typename ArgType, typename ReturnType = ArgType>
+struct divides_check_zero {
+    // Binary functor: returns a / b, or zero when b is zero; no double detour.
+public:
+    __host__ __device__ inline ReturnType operator()(ArgType a, ArgType b) const {
+        if (b == static_cast<ArgType>(0)) return static_cast<ReturnType>(0);
+        return static_cast<ReturnType>(a / b);
+    }
+};
+
+
+} // namespace detail
+} // namespace linalg
+} // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh
index c848ac1f4b..a2d8d67c30 100644
--- a/cpp/include/raft/linalg/divide.cuh
+++ b/cpp/include/raft/linalg/divide.cuh
@@ -16,11 +16,14 @@
 
 #pragma once
 
+#include "detail/functional.cuh"
 #include "unary_op.cuh"
 
 namespace raft {
 namespace linalg {
 
+using detail::divides_scalar;
+
 /**
  * @defgroup ScalarOps Scalar operations on the input buffer
  * @tparam math_t data-type upon which the math operation will be performed
@@ -36,7 +39,7 @@ template <typename math_t, typename IdxType = int>
 void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len,
                   cudaStream_t stream) {
   unaryOp(
-    out, in, len, [scalar] __device__(math_t in) { return in / scalar; },
+    out, in, len, divides_scalar<math_t>(scalar),
     stream);
 }
 /** @} */
diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
index 5b2df3bcb3..296f916469 100644
--- a/cpp/include/raft/linalg/eig.cuh
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -16,14 +16,7 @@
 
 #pragma once
 
-#include <cuda_runtime_api.h>
-#include <raft/cudart_utils.h>
-#include <raft/linalg/cusolver_wrappers.h>
-#include <raft/cuda_utils.cuh>
-#include <raft/handle.hpp>
-#include <raft/matrix/matrix.cuh>
-#include <rmm/device_scalar.hpp>
-#include <rmm/device_uvector.hpp>
+#include "detail/eig.hpp"
 
 namespace raft {
 namespace linalg {
@@ -45,28 +38,7 @@ template <typename math_t>
 void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows,
            int n_cols, math_t *eig_vectors, math_t *eig_vals,
            cudaStream_t stream) {
-  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-
-  int lwork;
-  CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR,
-                                            CUBLAS_FILL_MODE_UPPER, n_rows, in,
-                                            n_cols, eig_vals, &lwork));
-
-  rmm::device_uvector<math_t> d_work(lwork, stream);
-  rmm::device_scalar<int> d_dev_info(stream);
-
-  raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
-
-  CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR,
-                                 CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors,
-                                 n_cols, eig_vals, d_work.data(), lwork,
-                                 d_dev_info.data(), stream));
-  CUDA_CHECK(cudaGetLastError());
-
-  int dev_info = d_dev_info.value(stream);
-  ASSERT(dev_info == 0,
-         "eig.cuh: eigensolver couldn't converge to a solution. "
-         "This usually occurs when some of the features do not vary enough.");
+  detail::eigDC(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream);
 }
 
 enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT };
@@ -91,51 +63,7 @@ template <typename math_t>
 void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
               int n_eig_vals, math_t *eig_vectors, math_t *eig_vals,
               EigVecMemUsage memUsage, cudaStream_t stream) {
-  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-
-  int lwork;
-  int h_meig;
-
-  CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize(
-    cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
-    CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0),
-    n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork));
-
-  rmm::device_uvector<math_t> d_work(lwork, stream);
-  rmm::device_scalar<int> d_dev_info(stream);
-  rmm::device_uvector<math_t> d_eig_vectors(0, stream);
-
-  if (memUsage == OVERWRITE_INPUT) {
-    CUSOLVER_CHECK(cusolverDnsyevdx(
-      cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
-      CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0),
-      n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, d_work.data(), lwork,
-      d_dev_info.data(), stream));
-  } else if (memUsage == COPY_INPUT) {
-    d_eig_vectors.resize(n_rows * n_cols, stream);
-    raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream);
-
-    CUSOLVER_CHECK(cusolverDnsyevdx(
-      cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
-      CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, math_t(0.0),
-      math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals,
-      d_work.data(), lwork, d_dev_info.data(), stream));
-  }
-
-  CUDA_CHECK(cudaGetLastError());
-
-  int dev_info = d_dev_info.value(stream);
-  ASSERT(dev_info == 0,
-         "eig.cuh: eigensolver couldn't converge to a solution. "
-         "This usually occurs when some of the features do not vary enough.");
-
-  if (memUsage == OVERWRITE_INPUT) {
-    raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals,
-                                  stream);
-  } else if (memUsage == COPY_INPUT) {
-    raft::matrix::truncZeroOrigin(d_eig_vectors.data(), n_rows, eig_vectors,
-                                  n_rows, n_eig_vals, stream);
-  }
+  detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, eig_vals, memUsage, stream);
 }
 
 #endif
@@ -158,34 +86,7 @@ template <typename math_t>
 void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows,
                int n_cols, math_t *eig_vectors, math_t *eig_vals,
                cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) {
-  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-
-  syevjInfo_t syevj_params = nullptr;
-  CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params));
-  CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(syevj_params, tol));
-  CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps));
-
-  int lwork;
-  CUSOLVER_CHECK(cusolverDnsyevj_bufferSize(
-    cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows,
-    eig_vectors, n_cols, eig_vals, &lwork, syevj_params));
-
-  rmm::device_uvector<math_t> d_work(lwork, stream);
-  rmm::device_scalar<int> dev_info(stream);
-
-  raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
-
-  CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR,
-                                 CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors,
-                                 n_cols, eig_vals, d_work.data(), lwork,
-                                 dev_info.data(), syevj_params, stream));
-
-  int executed_sweeps;
-  CUSOLVER_CHECK(
-    cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps));
-
-  CUDA_CHECK(cudaGetLastError());
-  CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params));
+  detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh
index 1c6dee562d..b7c9619c4e 100644
--- a/cpp/include/raft/linalg/eltwise.cuh
+++ b/cpp/include/raft/linalg/eltwise.cuh
@@ -16,12 +16,16 @@
 
 #pragma once
 
+#include "detail/functional.cuh"
+
 #include "binary_op.cuh"
 #include "unary_op.cuh"
 
 namespace raft {
 namespace linalg {
 
+using detail::adds_scalar;
+
 /**
  * @defgroup ScalarOps Scalar operations on the input buffer
  * @tparam InType data-type upon which the math operation will be performed
@@ -37,15 +41,17 @@ template <typename InType, typename IdxType, typename OutType = InType>
 void scalarAdd(OutType *out, const InType *in, InType scalar, IdxType len,
                cudaStream_t stream) {
   raft::linalg::unaryOp(
-    out, in, len, [scalar] __device__(InType in) { return in + scalar; },
+    out, in, len, adds_scalar<InType, OutType>(scalar),
     stream);
 }
 
+using detail::multiplies_scalar;
+
 template <typename InType, typename IdxType, typename OutType = InType>
 void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len,
                     cudaStream_t stream) {
   raft::linalg::unaryOp(
-    out, in, len, [scalar] __device__(InType in) { return in * scalar; },
+    out, in, len, multiplies_scalar<InType, OutType>(scalar),
     stream);
 }
 /** @} */
@@ -65,7 +71,7 @@ template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseAdd(OutType *out, const InType *in1, const InType *in2, IdxType len,
                 cudaStream_t stream) {
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; },
+    out, in1, in2, len, thrust::plus<InType>(),
     stream);
 }
 
@@ -73,7 +79,7 @@ template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseSub(OutType *out, const InType *in1, const InType *in2, IdxType len,
                 cudaStream_t stream) {
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; },
+    out, in1, in2, len, thrust::minus<InType>(),
     stream);
 }
 
@@ -81,7 +87,7 @@ template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseMultiply(OutType *out, const InType *in1, const InType *in2,
                      IdxType len, cudaStream_t stream) {
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; },
+    out, in1, in2, len, thrust::multiplies<InType>(),
     stream);
 }
 
@@ -89,21 +95,18 @@ template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseDivide(OutType *out, const InType *in1, const InType *in2,
                    IdxType len, cudaStream_t stream) {
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; },
+    out, in1, in2, len, thrust::divides<InType>(),
     stream);
 }
 
+using detail::divides_check_zero;
+
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseDivideCheckZero(OutType *out, const InType *in1, const InType *in2,
                             IdxType len, cudaStream_t stream) {
   binaryOp(
     out, in1, in2, len,
-    [] __device__(InType a, InType b) {
-      if (b == InType(0.0))
-        return InType(0.0);
-      else
-        return a / b;
-    },
+    divides_check_zero<InType, OutType>(),
     stream);
 }
 /** @} */

From 788ffa864f501efee64904d833a9c9656e2dae57 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Tue, 16 Nov 2021 18:00:54 -0800
Subject: [PATCH 03/17] style fix

---
 .../raft/linalg/cholesky_r1_update.hpp        |   3 +-
 .../raft/linalg/coalesced_reduction.hpp       |   3 +-
 cpp/include/raft/linalg/detail/add.cuh        |  30 +-
 cpp/include/raft/linalg/detail/binary_op.cuh  |  30 +-
 .../raft/linalg/detail/cholesky_r1_update.hpp |  16 +-
 .../linalg/detail/coalesced_reduction.cuh     |  14 +-
 .../raft/linalg/detail/contractions.cuh       | 432 +++++++++---------
 cpp/include/raft/linalg/detail/eig.hpp        |   6 +-
 cpp/include/raft/linalg/detail/functional.cuh |  72 ++-
 cpp/include/raft/linalg/detail/gemm.hpp       |   6 +-
 cpp/include/raft/linalg/detail/lanczos.hpp    |   2 +-
 cpp/include/raft/linalg/detail/map.cuh        |  69 ++-
 .../raft/linalg/detail/map_then_reduce.cuh    |  48 +-
 .../raft/linalg/detail/matrix_vector_op.cuh   | 355 +++++++-------
 cpp/include/raft/linalg/detail/qr.cuh         | 183 ++++----
 .../raft/linalg/detail/strided_reduction.cuh  | 249 +++++-----
 cpp/include/raft/linalg/detail/subtract.cuh   |  71 ++-
 cpp/include/raft/linalg/detail/svd.cuh        | 316 ++++++-------
 cpp/include/raft/linalg/detail/unary_op.cuh   | 193 ++++----
 cpp/include/raft/linalg/divide.hpp            |   4 +-
 cpp/include/raft/linalg/eig.hpp               |   8 +-
 cpp/include/raft/linalg/eltwise.hpp           |  31 +-
 cpp/include/raft/linalg/gemm.hpp              |   6 +-
 cpp/include/raft/linalg/lanczos.hpp           |  20 +-
 cpp/include/raft/linalg/map.hpp               |   4 +-
 cpp/include/raft/linalg/map_then_reduce.hpp   |   5 +-
 cpp/include/raft/linalg/matrix_vector_op.hpp  |   6 +-
 cpp/include/raft/linalg/strided_reduction.hpp |   3 +-
 cpp/include/raft/linalg/svd.hpp               |  11 +-
 cpp/test/linalg/eig_sel.cu                    |   6 +-
 30 files changed, 1099 insertions(+), 1103 deletions(-)

diff --git a/cpp/include/raft/linalg/cholesky_r1_update.hpp b/cpp/include/raft/linalg/cholesky_r1_update.hpp
index e72d3e963f..9f669a5058 100644
--- a/cpp/include/raft/linalg/cholesky_r1_update.hpp
+++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp
@@ -121,7 +121,8 @@ template <typename math_t>
 void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
                          void *workspace, int *n_bytes, cublasFillMode_t uplo,
                          cudaStream_t stream, math_t eps = -1) {
-  detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, stream, eps);
+  detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo,
+                              stream, eps);
 }
 };  // namespace linalg
 };  // namespace raft
diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp
index 591d605cb2..ad5279b1ad 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.hpp
+++ b/cpp/include/raft/linalg/coalesced_reduction.hpp
@@ -57,7 +57,8 @@ void coalescedReduction(OutType *dots, const InType *data, int D, int N,
                         MainLambda main_op = raft::Nop<InType, IdxType>(),
                         ReduceLambda reduce_op = raft::Sum<OutType>(),
                         FinalLambda final_op = raft::Nop<OutType>()) {
-  detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+  detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op,
+                             reduce_op, final_op);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh
index 9c24514449..be7b8bb299 100644
--- a/cpp/include/raft/linalg/detail/add.cuh
+++ b/cpp/include/raft/linalg/detail/add.cuh
@@ -24,26 +24,26 @@ namespace detail {
 
 template <class math_t, typename IdxType>
 __global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
-                                    const math_t *singleScalarDev,
-                                    IdxType len) {
-IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
-if (i < len) {
+                                      const math_t *singleScalarDev,
+                                      IdxType len) {
+  IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
+  if (i < len) {
     outDev[i] = inDev[i] + *singleScalarDev;
-}
+  }
 }
 
 template <typename math_t, typename IdxType = int>
 void addDevScalar(math_t *outDev, const math_t *inDev,
-                const math_t *singleScalarDev, IdxType len,
-                cudaStream_t stream) {
-// TODO: block dimension has not been tuned
-dim3 block(256);
-dim3 grid(raft::ceildiv(len, (IdxType)block.x));
-add_dev_scalar_kernel<math_t>
+                  const math_t *singleScalarDev, IdxType len,
+                  cudaStream_t stream) {
+  // TODO: block dimension has not been tuned
+  dim3 block(256);
+  dim3 grid(raft::ceildiv(len, (IdxType)block.x));
+  add_dev_scalar_kernel<math_t>
     <<<grid, block, 0, stream>>>(outDev, inDev, singleScalarDev, len);
-CUDA_CHECK(cudaPeekAtLastError());
+  CUDA_CHECK(cudaPeekAtLastError());
 }
 
-} // namespace detail
-} // namespace linalg
-} // namespace raft
\ No newline at end of file
+}  // namespace detail
+}  // namespace linalg
+}  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh
index c0b670caf2..969c9dfa3e 100644
--- a/cpp/include/raft/linalg/detail/binary_op.cuh
+++ b/cpp/include/raft/linalg/detail/binary_op.cuh
@@ -14,15 +14,15 @@
  * limitations under the License.
  */
 
- #pragma once
+#pragma once
 
- #include <raft/vectorized.cuh>
+#include <raft/vectorized.cuh>
 
- namespace raft {
- namespace linalg {
- namespace detail {
+namespace raft {
+namespace linalg {
+namespace detail {
 
-    template <typename InType, int VecLen, typename Lambda, typename IdxType,
+template <typename InType, int VecLen, typename Lambda, typename IdxType,
           typename OutType>
 __global__ void binaryOpKernel(OutType *out, const InType *in1,
                                const InType *in2, IdxType len, Lambda op) {
@@ -73,19 +73,19 @@ void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len,
   uint64_t outAddr = uint64_t(out);
   if (16 / maxSize && bytes % 16 == 0 &&
       addressAligned(in1Addr, in2Addr, outAddr, 16)) {
-        binaryOpImpl<InType, 16 / maxSize, Lambda, IdxType, OutType, TPB>(
+    binaryOpImpl<InType, 16 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else if (8 / maxSize && bytes % 8 == 0 &&
              addressAligned(in1Addr, in2Addr, outAddr, 8)) {
-              binaryOpImpl<InType, 8 / maxSize, Lambda, IdxType, OutType, TPB>(
+    binaryOpImpl<InType, 8 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else if (4 / maxSize && bytes % 4 == 0 &&
-    addressAligned(in1Addr, in2Addr, outAddr, 4)) {
-               binaryOpImpl<InType, 4 / maxSize, Lambda, IdxType, OutType, TPB>(
+             addressAligned(in1Addr, in2Addr, outAddr, 4)) {
+    binaryOpImpl<InType, 4 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else if (2 / maxSize && bytes % 2 == 0 &&
-    addressAligned(in1Addr, in2Addr, outAddr, 2)) {
-              binaryOpImpl<InType, 2 / maxSize, Lambda, IdxType, OutType, TPB>(
+             addressAligned(in1Addr, in2Addr, outAddr, 2)) {
+    binaryOpImpl<InType, 2 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else if (1 / maxSize) {
     binaryOpImpl<InType, 1 / maxSize, Lambda, IdxType, OutType, TPB>(
@@ -96,6 +96,6 @@ void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len,
   }
 }
 
-    } // namespace detail
-} // namespace linalg
-} // namespace raft
\ No newline at end of file
+}  // namespace detail
+}  // namespace linalg
+}  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
index beb3056e6d..49bb190836 100644
--- a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
+++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
@@ -28,8 +28,8 @@ namespace detail {
 
 template <typename math_t>
 void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
-                        void *workspace, int *n_bytes, cublasFillMode_t uplo,
-                        cudaStream_t stream, math_t eps = -1) {
+                         void *workspace, int *n_bytes, cublasFillMode_t uplo,
+                         cudaStream_t stream, math_t eps = -1) {
   // The matrix A' is defined as:
   // A' = [[A_11, A_12]
   //       [A_21, A_22]]
@@ -50,8 +50,8 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
   // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats.
   const int align = 256;
   int offset = (uplo == CUBLAS_FILL_MODE_LOWER)
-                ? raft::alignTo<int>(sizeof(math_t) * (n - 1), align)
-                : 0;
+                 ? raft::alignTo<int>(sizeof(math_t) * (n - 1), align)
+                 : 0;
   if (workspace == nullptr) {
     *n_bytes = offset + 1 * sizeof(math_t);
     return;
@@ -84,7 +84,7 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
 
     // A_new now stores L_12, we calculate s = L_12 * L_12
     CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1,
-                                        A_new, 1, A_new, 1, s, stream));
+                                         A_new, 1, A_new, 1, s, stream));
 
     if (uplo == CUBLAS_FILL_MODE_LOWER) {
       // Copy back the L_12 elements as the n-th row of L
@@ -114,6 +114,6 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
   raft::update_device(L_22, &L_22_host, 1, stream);
 }
 
-} // namespace detail
-} // namespace linalg
-} // namespace raft
\ No newline at end of file
+}  // namespace detail
+}  // namespace linalg
+}  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
index f9c5223bdc..253b7032ed 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
@@ -14,14 +14,14 @@
  * limitations under the License.
  */
 
- #pragma once
+#pragma once
 
 #include <cub/cub.cuh>
 #include <raft/cuda_utils.cuh>
 
- namespace raft {
- namespace linalg {
- namespace detail {
+namespace raft {
+namespace linalg {
+namespace detail {
 
 // Kernel (based on norm.cuh) to perform reductions along the coalesced dimension
 // of the matrix, i.e. reduce along rows for row major or reduce along columns
@@ -84,6 +84,6 @@ void coalescedReduction(OutType *dots, const InType *data, int D, int N,
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
-    } // namespace detail
-} // namespace linalg
-} // namespace raft
\ No newline at end of file
+}  // namespace detail
+}  // namespace linalg
+}  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh
index 2e4657ebc3..b04c813cd8 100644
--- a/cpp/include/raft/linalg/detail/contractions.cuh
+++ b/cpp/include/raft/linalg/detail/contractions.cuh
@@ -14,73 +14,73 @@
  * limitations under the License.
  */
 
- #pragma once
+#pragma once
 
- #include <raft/common/device_loads_stores.cuh>
+#include <raft/common/device_loads_stores.cuh>
 
- namespace raft {
- namespace linalg {
- namespace detail {
+namespace raft {
+namespace linalg {
+namespace detail {
 
-    template <typename DataT, typename IdxT, typename Policy,
-    bool isRowMajor = true>
+template <typename DataT, typename IdxT, typename Policy,
+          bool isRowMajor = true>
 struct Contractions_NT {
-protected:
-typedef Policy P;
+ protected:
+  typedef Policy P;
 
-/** number of rows in X */
-IdxT m;
-/** number of rows in Y */
-IdxT n;
-/** number of columns in X and Y */
-IdxT k;
-/** leading dimension in X */
-IdxT lda;
-/** leading dimension in Y */
-IdxT ldb;
-/** leading dimension in Output D */
-IdxT ldd;
+  /** number of rows in X */
+  IdxT m;
+  /** number of rows in Y */
+  IdxT n;
+  /** number of columns in X and Y */
+  IdxT k;
+  /** leading dimension in X */
+  IdxT lda;
+  /** leading dimension in Y */
+  IdxT ldb;
+  /** leading dimension in Output D */
+  IdxT ldd;
 
-/** current thread's global mem row id for X data */
-IdxT xrowid;
-/** current thread's global mem row id for Y data */
-IdxT yrowid;
-/** global memory pointer to X matrix */
-const DataT* x;
-/** global memory pointer to Y matrix */
-const DataT* y;
+  /** current thread's global mem row id for X data */
+  IdxT xrowid;
+  /** current thread's global mem row id for Y data */
+  IdxT yrowid;
+  /** global memory pointer to X matrix */
+  const DataT* x;
+  /** global memory pointer to Y matrix */
+  const DataT* y;
 
-/** current thread's smem row id */
-int srowid;
-/** current thread's smem column id */
-int scolid;
-/** current thread's accumulation row id */
-int accrowid;
-/** current thread's accumulation column id */
-int acccolid;
+  /** current thread's smem row id */
+  int srowid;
+  /** current thread's smem column id */
+  int scolid;
+  /** current thread's accumulation row id */
+  int accrowid;
+  /** current thread's accumulation column id */
+  int acccolid;
 
-/** base smem pointer for X data storage */
-DataT* sx;
-/** base smem pointer for Y data storage */
-DataT* sy;
-/** index pointing the correct smem page for writing after `ldgXY()` */
-int pageWr;
-/** index pointing the correct smem page for reading during `ldsXY()` */
-int pageRd;
+  /** base smem pointer for X data storage */
+  DataT* sx;
+  /** base smem pointer for Y data storage */
+  DataT* sy;
+  /** index pointing the correct smem page for writing after `ldgXY()` */
+  int pageWr;
+  /** index pointing the correct smem page for reading during `ldsXY()` */
+  int pageRd;
 
-/** block of X data loaded from smem after `ldsXY()` */
-DataT regx[P::AccRowsPerTh][P::Veclen];
-/** block of Y data loaded from smem after `ldsXY()` */
-DataT regy[P::AccColsPerTh][P::Veclen];
-/** block of X data loaded from global mem after `ldgXY()` */
-DataT ldgDataX[P::LdgPerThX][P::Veclen];
-/** block of Y data loaded from global mem after `ldgXY()` */
-DataT ldgDataY[P::LdgPerThY][P::Veclen];
+  /** block of X data loaded from smem after `ldsXY()` */
+  DataT regx[P::AccRowsPerTh][P::Veclen];
+  /** block of Y data loaded from smem after `ldsXY()` */
+  DataT regy[P::AccColsPerTh][P::Veclen];
+  /** block of X data loaded from global mem after `ldgXY()` */
+  DataT ldgDataX[P::LdgPerThX][P::Veclen];
+  /** block of Y data loaded from global mem after `ldgXY()` */
+  DataT ldgDataY[P::LdgPerThY][P::Veclen];
 
-static const DataT Zero = (DataT)0;
+  static const DataT Zero = (DataT)0;
 
-public:
-/**
+ public:
+  /**
 * @brief Ctor
 * @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major]
 * @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major]
@@ -89,27 +89,27 @@ public:
 * @param[in] _k number of cols of X and Y
 * @param[in] _smem shared memory region used during computations
 */
-DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
-               IdxT _k, char* _smem)
-: m(_m),
-n(_n),
-k(_k),
-lda(_k),
-ldb(_k),
-xrowid(IdxT(blockIdx.x) * P::Mblk + threadIdx.x / P::LdgThRow),
-yrowid(IdxT(blockIdx.y) * P::Nblk + threadIdx.x / P::LdgThRow),
-x(_x + xrowid * lda),
-y(_y + yrowid * ldb),
-srowid(threadIdx.x / P::LdgThRow),
-scolid((threadIdx.x % P::LdgThRow) * P::Veclen),
-accrowid(threadIdx.x / P::AccThCols),
-acccolid(threadIdx.x % P::AccThCols),
-sx((DataT*)_smem),
-sy(&(sx[P::SmemPageX])),
-pageWr(0),
-pageRd(0) {}
+  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
+                     IdxT _k, char* _smem)
+    : m(_m),
+      n(_n),
+      k(_k),
+      lda(_k),
+      ldb(_k),
+      xrowid(IdxT(blockIdx.x) * P::Mblk + threadIdx.x / P::LdgThRow),
+      yrowid(IdxT(blockIdx.y) * P::Nblk + threadIdx.x / P::LdgThRow),
+      x(_x + xrowid * lda),
+      y(_y + yrowid * ldb),
+      srowid(threadIdx.x / P::LdgThRow),
+      scolid((threadIdx.x % P::LdgThRow) * P::Veclen),
+      accrowid(threadIdx.x / P::AccThCols),
+      acccolid(threadIdx.x % P::AccThCols),
+      sx((DataT*)_smem),
+      sy(&(sx[P::SmemPageX])),
+      pageWr(0),
+      pageRd(0) {}
 
-/**
+  /**
 * @brief Ctor
 * @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major]
 * @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major]
@@ -118,186 +118,186 @@ pageRd(0) {}
 * @param[in] _k number of cols of X and Y
 * @param[in] _smem shared memory region used during computations
 */
-DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
-               IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem)
-: m(_m),
-n(_n),
-k(_k),
-lda(_lda),
-ldb(_ldb),
-ldd(_ldd),
-srowid(threadIdx.x / P::LdgThRow),
-scolid((threadIdx.x % P::LdgThRow) * P::Veclen),
-accrowid(threadIdx.x / P::AccThCols),
-acccolid(threadIdx.x % P::AccThCols),
-sx((DataT*)_smem),
-sy(&(sx[P::SmemPageX])),
-pageWr(0),
-pageRd(0) {
-if (isRowMajor) {
-xrowid = IdxT(blockIdx.y) * P::Mblk + srowid;
-yrowid = IdxT(blockIdx.x) * P::Nblk + srowid;
-x = _x + xrowid * lda;
-y = _y + yrowid * ldb;
-} else {
-xrowid = IdxT(blockIdx.y) * P::Mblk;
-yrowid = IdxT(blockIdx.x) * P::Nblk;
-x = _x + xrowid + srowid * lda;
-y = _y + yrowid + srowid * ldb;
-}
-}
+  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
+                     IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem)
+    : m(_m),
+      n(_n),
+      k(_k),
+      lda(_lda),
+      ldb(_ldb),
+      ldd(_ldd),
+      srowid(threadIdx.x / P::LdgThRow),
+      scolid((threadIdx.x % P::LdgThRow) * P::Veclen),
+      accrowid(threadIdx.x / P::AccThCols),
+      acccolid(threadIdx.x % P::AccThCols),
+      sx((DataT*)_smem),
+      sy(&(sx[P::SmemPageX])),
+      pageWr(0),
+      pageRd(0) {
+    if (isRowMajor) {
+      xrowid = IdxT(blockIdx.y) * P::Mblk + srowid;
+      yrowid = IdxT(blockIdx.x) * P::Nblk + srowid;
+      x = _x + xrowid * lda;
+      y = _y + yrowid * ldb;
+    } else {
+      xrowid = IdxT(blockIdx.y) * P::Mblk;
+      yrowid = IdxT(blockIdx.x) * P::Nblk;
+      x = _x + xrowid + srowid * lda;
+      y = _y + yrowid + srowid * ldb;
+    }
+  }
 
-protected:
-/**
+ protected:
+  /**
 * @brief Load current block of X/Y from global memory to registers
 * @param[in] kidx current start index of k to be loaded
 */
-DI void ldgXY(IdxT kidx) {
-ldgX(kidx);
-ldgY(kidx);
-}
+  DI void ldgXY(IdxT kidx) {
+    ldgX(kidx);
+    ldgY(kidx);
+  }
 
-/**
+  /**
 * @brief Store current block of X/Y from registers to smem
 * @param[in] kidx current start index of k to be loaded
 */
-DI void stsXY() {
-stsX(sx + pageWr * P::SmemPage);
-stsY(sy + pageWr * P::SmemPage);
-}
+  DI void stsXY() {
+    stsX(sx + pageWr * P::SmemPage);
+    stsY(sy + pageWr * P::SmemPage);
+  }
 
-/**
+  /**
 * @brief Load X and Y block from shared memory to registers
 * @param[in] kidx k value from the current k-block to be loaded from smem
 */
-DI void ldsXY(int kidx) {
-ldsX(kidx, sx + pageRd * P::SmemPage);
-ldsY(kidx, sy + pageRd * P::SmemPage);
-}
+  DI void ldsXY(int kidx) {
+    ldsX(kidx, sx + pageRd * P::SmemPage);
+    ldsY(kidx, sy + pageRd * P::SmemPage);
+  }
 
-private:
-DI void ldgX(IdxT kidx) {
-if (isRowMajor) {
-auto numRows = m;
-auto koffset = kidx + scolid;
+ private:
+  DI void ldgX(IdxT kidx) {
+    if (isRowMajor) {
+      auto numRows = m;
+      auto koffset = kidx + scolid;
 #pragma unroll
-for (int i = 0; i < P::LdgPerThX; ++i) {
-  if (koffset < lda && (xrowid + i * P::LdgRowsX) < numRows) {
-    ldg(ldgDataX[i], x + i * P::LdgRowsX * lda + koffset);
-  } else {
+      for (int i = 0; i < P::LdgPerThX; ++i) {
+        if (koffset < lda && (xrowid + i * P::LdgRowsX) < numRows) {
+          ldg(ldgDataX[i], x + i * P::LdgRowsX * lda + koffset);
+        } else {
 #pragma unroll
-    for (int j = 0; j < P::Veclen; ++j) {
-      ldgDataX[i][j] = Zero;
-    }
-  }
-}
-} else {
-const auto numRows = k;
-auto koffset = scolid;
+          for (int j = 0; j < P::Veclen; ++j) {
+            ldgDataX[i][j] = Zero;
+          }
+        }
+      }
+    } else {
+      const auto numRows = k;
+      auto koffset = scolid;
 #pragma unroll
-for (int i = 0; i < P::LdgPerThX; ++i) {
-  if ((koffset + xrowid) < lda &&
-      (srowid + kidx + i * P::LdgRowsX) < numRows) {
-    ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset);
-  } else {
+      for (int i = 0; i < P::LdgPerThX; ++i) {
+        if ((koffset + xrowid) < lda &&
+            (srowid + kidx + i * P::LdgRowsX) < numRows) {
+          ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset);
+        } else {
 #pragma unroll
-    for (int j = 0; j < P::Veclen; ++j) {
-      ldgDataX[i][j] = Zero;
+          for (int j = 0; j < P::Veclen; ++j) {
+            ldgDataX[i][j] = Zero;
+          }
+        }
+      }
     }
   }
-}
-}
-}
 
-DI void ldgY(IdxT kidx) {
-if (isRowMajor) {
-auto numRows = n;
-auto koffset = kidx + scolid;
+  DI void ldgY(IdxT kidx) {
+    if (isRowMajor) {
+      auto numRows = n;
+      auto koffset = kidx + scolid;
 #pragma unroll
-for (int i = 0; i < P::LdgPerThY; ++i) {
-  if (koffset < ldb && (yrowid + i * P::LdgRowsY) < numRows) {
-    ldg(ldgDataY[i], y + i * P::LdgRowsY * ldb + koffset);
-  } else {
+      for (int i = 0; i < P::LdgPerThY; ++i) {
+        if (koffset < ldb && (yrowid + i * P::LdgRowsY) < numRows) {
+          ldg(ldgDataY[i], y + i * P::LdgRowsY * ldb + koffset);
+        } else {
 #pragma unroll
-    for (int j = 0; j < P::Veclen; ++j) {
-      ldgDataY[i][j] = Zero;
-    }
-  }
-}
-} else {
-auto numRows = k;
-auto koffset = scolid;
+          for (int j = 0; j < P::Veclen; ++j) {
+            ldgDataY[i][j] = Zero;
+          }
+        }
+      }
+    } else {
+      auto numRows = k;
+      auto koffset = scolid;
 #pragma unroll
-for (int i = 0; i < P::LdgPerThY; ++i) {
-  if ((koffset + yrowid) < ldb &&
-      (srowid + kidx + i * P::LdgRowsY) < numRows) {
-    ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset);
-  } else {
+      for (int i = 0; i < P::LdgPerThY; ++i) {
+        if ((koffset + yrowid) < ldb &&
+            (srowid + kidx + i * P::LdgRowsY) < numRows) {
+          ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset);
+        } else {
 #pragma unroll
-    for (int j = 0; j < P::Veclen; ++j) {
-      ldgDataY[i][j] = Zero;
+          for (int j = 0; j < P::Veclen; ++j) {
+            ldgDataY[i][j] = Zero;
+          }
+        }
+      }
     }
   }
-}
-}
-}
 
-DI void stsX(DataT* smem) {
-auto* saddr = smem + srowid * P::SmemStride + scolid;
+  DI void stsX(DataT* smem) {
+    auto* saddr = smem + srowid * P::SmemStride + scolid;
 #pragma unroll
-for (int i = 0; i < P::LdgPerThX; ++i) {
-sts(saddr + i * P::LdgRowsX * P::SmemStride, ldgDataX[i]);
-}
-}
+    for (int i = 0; i < P::LdgPerThX; ++i) {
+      sts(saddr + i * P::LdgRowsX * P::SmemStride, ldgDataX[i]);
+    }
+  }
 
-DI void stsY(DataT* smem) {
-auto* saddr = smem + srowid * P::SmemStride + scolid;
+  DI void stsY(DataT* smem) {
+    auto* saddr = smem + srowid * P::SmemStride + scolid;
 #pragma unroll
-for (int i = 0; i < P::LdgPerThY; ++i) {
-sts(saddr + i * P::LdgRowsY * P::SmemStride, ldgDataY[i]);
-}
-}
+    for (int i = 0; i < P::LdgPerThY; ++i) {
+      sts(saddr + i * P::LdgRowsY * P::SmemStride, ldgDataY[i]);
+    }
+  }
 
-DI void ldsX(int kidx, DataT* smem) {
-if (isRowMajor) {
-auto* saddr = smem + accrowid * P::SmemStride + kidx;
+  DI void ldsX(int kidx, DataT* smem) {
+    if (isRowMajor) {
+      auto* saddr = smem + accrowid * P::SmemStride + kidx;
 #pragma unroll
-for (int i = 0; i < P::AccRowsPerTh; ++i) {
-  lds(regx[i], saddr + i * P::AccThRows * P::SmemStride);
-}
-} else {
-auto* saddr = smem + accrowid + kidx * P::SmemStride;
+      for (int i = 0; i < P::AccRowsPerTh; ++i) {
+        lds(regx[i], saddr + i * P::AccThRows * P::SmemStride);
+      }
+    } else {
+      auto* saddr = smem + accrowid + kidx * P::SmemStride;
 #pragma unroll
-for (int i = 0; i < P::AccRowsPerTh; ++i) {
+      for (int i = 0; i < P::AccRowsPerTh; ++i) {
 #pragma unroll
-  for (int v = 0; v < P::Veclen; ++v) {
-    regx[i][v] = saddr[i * P::AccThRows + v * P::SmemStride];
+        for (int v = 0; v < P::Veclen; ++v) {
+          regx[i][v] = saddr[i * P::AccThRows + v * P::SmemStride];
+        }
+      }
+    }
   }
-}
-}
-}
 
-DI void ldsY(int kidx, DataT* smem) {
-if (isRowMajor) {
-auto* saddr = smem + acccolid * P::SmemStride + kidx;
+  DI void ldsY(int kidx, DataT* smem) {
+    if (isRowMajor) {
+      auto* saddr = smem + acccolid * P::SmemStride + kidx;
 #pragma unroll
-for (int i = 0; i < P::AccColsPerTh; ++i) {
-  lds(regy[i], saddr + i * P::AccThCols * P::SmemStride);
-}
-} else {
-auto* saddr = smem + acccolid + kidx * P::SmemStride;
+      for (int i = 0; i < P::AccColsPerTh; ++i) {
+        lds(regy[i], saddr + i * P::AccThCols * P::SmemStride);
+      }
+    } else {
+      auto* saddr = smem + acccolid + kidx * P::SmemStride;
 #pragma unroll
-for (int i = 0; i < P::AccColsPerTh; ++i) {
+      for (int i = 0; i < P::AccColsPerTh; ++i) {
 #pragma unroll
-  for (int v = 0; v < P::Veclen; ++v) {
-    regy[i][v] = saddr[i * P::AccThCols + v * P::SmemStride];
+        for (int v = 0; v < P::Veclen; ++v) {
+          regy[i][v] = saddr[i * P::AccThCols + v * P::SmemStride];
+        }
+      }
+    }
   }
-}
-}
-}
 
 };  // struct Contractions_NT
 
-} // namespace detail
-} // namespace linalg
-} // namespace raft
\ No newline at end of file
+}  // namespace detail
+}  // namespace linalg
+}  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp
index 9e0966e67b..c37f3c92a5 100644
--- a/cpp/include/raft/linalg/detail/eig.hpp
+++ b/cpp/include/raft/linalg/detail/eig.hpp
@@ -202,6 +202,6 @@ void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows,
   CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params));
 }
 
-} // namespace detail
-} // namespace linalg
-} // namespace raft
\ No newline at end of file
+}  // namespace detail
+}  // namespace linalg
+}  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/functional.cuh b/cpp/include/raft/linalg/detail/functional.cuh
index 275e5f5917..fec2e27228 100644
--- a/cpp/include/raft/linalg/detail/functional.cuh
+++ b/cpp/include/raft/linalg/detail/functional.cuh
@@ -14,67 +14,61 @@
  * limitations under the License.
  */
 
- #pragma once
+#pragma once
 
- #include <thrust/functional.h>
+#include <thrust/functional.h>
 
- namespace raft {
- namespace linalg {
- namespace detail {
+namespace raft {
+namespace linalg {
+namespace detail {
 
 template <typename ArgType, typename ReturnType = ArgType>
 struct divides_scalar {
+ public:
+  divides_scalar(ArgType scalar) : scalar_(scalar) {}
 
-public:
-    divides_scalar(ArgType scalar) : scalar_(scalar) {} 
+  __host__ __device__ inline ReturnType operator()(ArgType in) {
+    return in / scalar_;
+  }
 
-    __host__ __device__ inline ReturnType operator()(ArgType in) {
-        return in / scalar_;
-    }
-
-private:
-    ArgType scalar_;
+ private:
+  ArgType scalar_;
 };
 
 template <typename ArgType, typename ReturnType = ArgType>
 struct adds_scalar {
+ public:
+  adds_scalar(ArgType scalar) : scalar_(scalar) {}
 
-public:
-    adds_scalar(ArgType scalar) : scalar_(scalar) {} 
-
-    __host__ __device__ inline ReturnType operator()(ArgType in) {
-        return in + scalar_;
-    }
+  __host__ __device__ inline ReturnType operator()(ArgType in) {
+    return in + scalar_;
+  }
 
-private:
-    ArgType scalar_;
+ private:
+  ArgType scalar_;
 };
 
 template <typename ArgType, typename ReturnType = ArgType>
 struct multiplies_scalar {
+ public:
+  multiplies_scalar(ArgType scalar) : scalar_(scalar) {}
 
-public:
-    multiplies_scalar(ArgType scalar) : scalar_(scalar) {} 
-
-    __host__ __device__ inline ReturnType operator()(ArgType in) {
-        return in * scalar_;
-    }
+  __host__ __device__ inline ReturnType operator()(ArgType in) {
+    return in * scalar_;
+  }
 
-private:
-    ArgType scalar_;
+ private:
+  ArgType scalar_;
 };
 
 template <typename ArgType, typename ReturnType = ArgType>
 struct divides_check_zero {
-
-public:
-    __host__ __device__ inline ReturnType operator()(ArgType a, ArgType b) {
-        return (b == static_cast<ArgType>(0)) ? 0.0 : a / b;
-    }
-
+ public:
+  __host__ __device__ inline ReturnType operator()(ArgType a, ArgType b) {
+    return (b == static_cast<ArgType>(0)) ? 0.0 : a / b;
+  }
 };
 
-
-} // namespace detail
-} // namespace linalg
-} // namespace raft
\ No newline at end of file
+}  // namespace detail
+}  // namespace linalg
+}  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp
index 1a09e86532..c565d69f86 100644
--- a/cpp/include/raft/linalg/detail/gemm.hpp
+++ b/cpp/include/raft/linalg/detail/gemm.hpp
@@ -119,6 +119,6 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
                           b, ldb, &beta, c, ldc, stream));
 }
 
-} // namespace detail
-} // namespace linalg
-} // namespace raft
\ No newline at end of file
+}  // namespace detail
+}  // namespace linalg
+}  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp
index e1672ef23c..dc75b70509 100644
--- a/cpp/include/raft/linalg/detail/lanczos.hpp
+++ b/cpp/include/raft/linalg/detail/lanczos.hpp
@@ -596,7 +596,7 @@ static int lanczosRestart(
   return 0;
 }
 
-} // namespace detail
+}  // namespace detail
 }  // namespace spectral
 
 namespace detail {
diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh
index faa899492e..0e649fb937 100644
--- a/cpp/include/raft/linalg/detail/map.cuh
+++ b/cpp/include/raft/linalg/detail/map.cuh
@@ -14,39 +14,38 @@
  * limitations under the License.
  */
 
- #pragma once
+#pragma once
 
- #include <cub/cub.cuh>
- #include <raft/cuda_utils.cuh>
- #include <raft/handle.hpp>
- #include <raft/vectorized.cuh>
- 
- namespace raft {
- namespace linalg {
- namespace detail {
- 
- template <typename InType, typename OutType, typename MapOp, int TPB,
-           typename... Args>
- __global__ void mapKernel(OutType *out, size_t len, MapOp map, const InType *in,
-                           Args... args) {
-   auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
- 
-   if (idx < len) {
-     out[idx] = map(in[idx], args[idx]...);
-   }
- }
- 
- template <typename InType, typename OutType, typename MapOp, int TPB,
-           typename... Args>
- void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream,
-              const InType *in, Args... args) {
-   const int nblks = raft::ceildiv(len, (size_t)TPB);
-   mapKernel<InType, OutType, MapOp, TPB, Args...>
-     <<<nblks, TPB, 0, stream>>>(out, len, map, in, args...);
-   CUDA_CHECK(cudaPeekAtLastError());
- }
- 
- }   // namespace detail
- }   // namespace linalg
- };  // namespace raft
- 
\ No newline at end of file
+#include <cub/cub.cuh>
+#include <raft/cuda_utils.cuh>
+#include <raft/handle.hpp>
+#include <raft/vectorized.cuh>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+// Element-wise map: out[i] = map(in[i], args[i]...) for each i < len.
+template <typename InType, typename OutType, typename MapOp, int TPB,
+          typename... Args>
+__global__ void mapKernel(OutType *out, size_t len, MapOp map, const InType *in,
+                          Args... args) {
+  // Index in size_t: len is size_t, and the 32-bit product
+  // blockIdx.x * blockDim.x would wrap once it reaches 2^32.
+  size_t idx = threadIdx.x + static_cast<size_t>(blockIdx.x) * blockDim.x;
+  if (idx < len) out[idx] = map(in[idx], args[idx]...);
+}
+
+template <typename InType, typename OutType, typename MapOp, int TPB,
+          typename... Args>
+void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream,
+             const InType *in, Args... args) {
+  const int nblks = raft::ceildiv(len, (size_t)TPB);
+  mapKernel<InType, OutType, MapOp, TPB, Args...>
+    <<<nblks, TPB, 0, stream>>>(out, len, map, in, args...);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+}  // namespace detail
+}  // namespace linalg
+};  // namespace raft
diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh
index 98a08713dc..a7031bc48f 100644
--- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh
+++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh
@@ -29,51 +29,51 @@ struct sum_tag {};
 
 template <typename InType, typename OutType, int TPB>
 __device__ void reduce(OutType *out, const InType acc, sum_tag) {
-typedef cub::BlockReduce<InType, TPB> BlockReduce;
-__shared__ typename BlockReduce::TempStorage temp_storage;
-OutType tmp = BlockReduce(temp_storage).Sum(acc);
-if (threadIdx.x == 0) {
+  typedef cub::BlockReduce<InType, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  OutType tmp = BlockReduce(temp_storage).Sum(acc);
+  if (threadIdx.x == 0) {
     raft::myAtomicAdd(out, tmp);
-}
+  }
 }
 
 template <typename InType, typename OutType, int TPB, typename ReduceLambda>
 __device__ void reduce(OutType *out, const InType acc, ReduceLambda op) {
-typedef cub::BlockReduce<InType, TPB> BlockReduce;
-__shared__ typename BlockReduce::TempStorage temp_storage;
-OutType tmp = BlockReduce(temp_storage).Reduce(acc, op);
-if (threadIdx.x == 0) {
+  typedef cub::BlockReduce<InType, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  OutType tmp = BlockReduce(temp_storage).Reduce(acc, op);
+  if (threadIdx.x == 0) {
     raft::myAtomicReduce(out, tmp, op);
-}
+  }
 }
 
 template <typename InType, typename OutType, typename MapOp,
-        typename ReduceLambda, int TPB, typename... Args>
+          typename ReduceLambda, int TPB, typename... Args>
 __global__ void mapThenReduceKernel(OutType *out, size_t len, OutType neutral,
                                     MapOp map, ReduceLambda op,
                                     const InType *in, Args... args) {
-OutType acc = neutral;
-auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
+  OutType acc = neutral;
+  auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
 
-if (idx < len) {
+  if (idx < len) {
     acc = map(in[idx], args[idx]...);
-}
+  }
 
-__syncthreads();
+  __syncthreads();
 
-reduce<InType, OutType, TPB>(out, acc, op);
+  reduce<InType, OutType, TPB>(out, acc, op);
 }
 
 template <typename InType, typename OutType, typename MapOp,
-        typename ReduceLambda, int TPB, typename... Args>
+          typename ReduceLambda, int TPB, typename... Args>
 void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map,
-                    ReduceLambda op, cudaStream_t stream, const InType *in,
-                    Args... args) {
-raft::update_device(out, &neutral, 1, stream);
-const int nblks = raft::ceildiv(len, (size_t)TPB);
-mapThenReduceKernel<InType, OutType, MapOp, ReduceLambda, TPB, Args...>
+                       ReduceLambda op, cudaStream_t stream, const InType *in,
+                       Args... args) {
+  raft::update_device(out, &neutral, 1, stream);
+  const int nblks = raft::ceildiv(len, (size_t)TPB);
+  mapThenReduceKernel<InType, OutType, MapOp, ReduceLambda, TPB, Args...>
     <<<nblks, TPB, 0, stream>>>(out, len, neutral, map, op, in, args...);
-CUDA_CHECK(cudaPeekAtLastError());
+  CUDA_CHECK(cudaPeekAtLastError());
 }
 
 };  // end namespace detail
diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh
index d46a7833e1..17f748248b 100644
--- a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh
@@ -14,182 +14,181 @@
  * limitations under the License.
  */
 
- #pragma once
+#pragma once
 
- #include <raft/cuda_utils.cuh>
- #include <raft/vectorized.cuh>
- 
- namespace raft {
- namespace linalg {
- namespace detail {
- 
- template <typename Type, int veclen_, typename Lambda, typename IdxType>
- __global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
-                                      const Type *vector, IdxType D, IdxType N,
-                                      bool rowMajor, bool bcastAlongRows,
-                                      Lambda op) {
-   typedef TxN_t<Type, veclen_> VecType;
-   IdxType len = N * D;
-   IdxType idx = threadIdx.x;
-   idx += (IdxType)blockIdx.x * (IdxType)blockDim.x;
-   idx *= VecType::Ratio;
-   if (idx >= len) return;
-   IdxType vIdx;
-   VecType mat, vec;
-   ///@todo: yikes! use fast-int-div here.
-   ///@todo: shared mem for vector could help with perf
-   if (rowMajor && bcastAlongRows) {
-     vIdx = idx % D;
-     vec.load(vector, vIdx);
-   } else if (!rowMajor && !bcastAlongRows) {
-     vIdx = idx % N;
-     vec.load(vector, vIdx);
-   } else if (rowMajor && !bcastAlongRows) {
-     vIdx = idx / D;
-     vec.fill(vector[vIdx]);
-   } else {
-     vIdx = idx / N;
-     vec.fill(vector[vIdx]);
-   }
-   mat.load(matrix, idx);
- #pragma unroll
-   for (int i = 0; i < VecType::Ratio; ++i)
-     mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]);
-   mat.store(out, idx);
- }
- 
- template <typename Type, int veclen_, typename Lambda, typename IdxType,
-           int TPB>
- void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec,
-                         IdxType D, IdxType N, bool rowMajor,
-                         bool bcastAlongRows, Lambda op, cudaStream_t stream) {
-   IdxType len = N * D;
-   IdxType nblks =
-     raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB);
-   matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
-     <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor,
-                                 bcastAlongRows, op);
-   CUDA_CHECK(cudaPeekAtLastError());
- }
- 
- template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
- void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D,
-                     IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op,
-                     cudaStream_t stream) {
-   IdxType stride = rowMajor ? D : N;
-   size_t stride_bytes = stride * sizeof(Type);
- 
-   auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) {
-     return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 &&
-            reinterpret_cast<uintptr_t>(matrix) % sizeof(Type);
-   };
- 
-   if (test_aligned_access(16)) {
-     matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
-       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-   } else if (test_aligned_access(8)) {
-     matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
-       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-   } else if (test_aligned_access(4)) {
-     matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
-       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-   } else if (test_aligned_access(2)) {
-     matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
-       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-   } else if (1 / sizeof(Type)) {
-     matrixVectorOpImpl<Type, 1 / sizeof(Type), Lambda, IdxType, TPB>(
-       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-   } else {
-     matrixVectorOpImpl<Type, 1, Lambda, IdxType, TPB>(
-       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-   }
- }
- 
- ///@todo: come up with a cleaner interface to support these cases in future!
- 
- template <typename Type, int veclen_, typename Lambda, typename IdxType>
- __global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
-                                      const Type *vector1, const Type *vector2,
-                                      IdxType D, IdxType N, bool rowMajor,
-                                      bool bcastAlongRows, Lambda op) {
-   typedef TxN_t<Type, veclen_> VecType;
-   IdxType len = N * D;
-   IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio;
-   if (idx >= len) return;
-   IdxType vIdx;
-   VecType mat, vec1, vec2;
-   ///@todo: yikes! use fast-int-div here.
-   ///@todo: shared mem for vector could help with perf
-   if (rowMajor && bcastAlongRows) {
-     vIdx = idx % D;
-     vec1.load(vector1, vIdx);
-     vec2.load(vector2, vIdx);
-   } else if (!rowMajor && !bcastAlongRows) {
-     vIdx = idx % N;
-     vec1.load(vector1, vIdx);
-     vec2.load(vector2, vIdx);
-   } else if (rowMajor && !bcastAlongRows) {
-     vIdx = idx / D;
-     vec1.fill(vector1[vIdx]);
-     vec2.fill(vector2[vIdx]);
-   } else {
-     vIdx = idx / N;
-     vec1.fill(vector1[vIdx]);
-     vec2.fill(vector2[vIdx]);
-   }
-   mat.load(matrix, idx);
- #pragma unroll
-   for (int i = 0; i < VecType::Ratio; ++i)
-     mat.val.data[i] = op(mat.val.data[i], vec1.val.data[i], vec2.val.data[i]);
-   mat.store(out, idx);
- }
- 
- template <typename Type, int veclen_, typename Lambda, typename IdxType,
-           int TPB>
- void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1,
-                         const Type *vec2, IdxType D, IdxType N, bool rowMajor,
-                         bool bcastAlongRows, Lambda op, cudaStream_t stream) {
-   IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB);
-   matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
-     <<<nblks, TPB, 0, stream>>>(out, matrix, vec1, vec2, D, N, rowMajor,
-                                 bcastAlongRows, op);
-   CUDA_CHECK(cudaPeekAtLastError());
- }
- 
- template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
- void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1,
-                     const Type *vec2, IdxType D, IdxType N, bool rowMajor,
-                     bool bcastAlongRows, Lambda op, cudaStream_t stream) {
-   IdxType stride = rowMajor ? D : N;
-   size_t stride_bytes = stride * sizeof(Type);
- 
-   auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) {
-     return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 &&
-            reinterpret_cast<uintptr_t>(matrix) % sizeof(Type);
-   };
- 
-   if (test_aligned_access(16)) {
-     matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
-       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-   } else if (test_aligned_access(8)) {
-     matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
-       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-   } else if (test_aligned_access(4)) {
-     matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
-       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-   } else if (test_aligned_access(2)) {
-     matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
-       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-   } else if (1 / sizeof(Type)) {
-     matrixVectorOpImpl<Type, 1 / sizeof(Type), Lambda, IdxType, TPB>(
-       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-   } else {
-     matrixVectorOpImpl<Type, 1, Lambda, IdxType, TPB>(
-       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-   }
- }
- 
- };  // end namespace detail 
- };  // end namespace linalg
- };  // end namespace raft
- 
\ No newline at end of file
+#include <raft/cuda_utils.cuh>
+#include <raft/vectorized.cuh>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+template <typename Type, int veclen_, typename Lambda, typename IdxType>
+__global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
+                                     const Type *vector, IdxType D, IdxType N,
+                                     bool rowMajor, bool bcastAlongRows,
+                                     Lambda op) {
+  typedef TxN_t<Type, veclen_> VecType;
+  IdxType len = N * D;
+  IdxType idx = threadIdx.x;
+  idx += (IdxType)blockIdx.x * (IdxType)blockDim.x;
+  idx *= VecType::Ratio;
+  if (idx >= len) return;
+  IdxType vIdx;
+  VecType mat, vec;
+  ///@todo: yikes! use fast-int-div here.
+  ///@todo: shared mem for vector could help with perf
+  if (rowMajor && bcastAlongRows) {
+    vIdx = idx % D;
+    vec.load(vector, vIdx);
+  } else if (!rowMajor && !bcastAlongRows) {
+    vIdx = idx % N;
+    vec.load(vector, vIdx);
+  } else if (rowMajor && !bcastAlongRows) {
+    vIdx = idx / D;
+    vec.fill(vector[vIdx]);
+  } else {
+    vIdx = idx / N;
+    vec.fill(vector[vIdx]);
+  }
+  mat.load(matrix, idx);
+#pragma unroll
+  for (int i = 0; i < VecType::Ratio; ++i)
+    mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]);
+  mat.store(out, idx);
+}
+
+template <typename Type, int veclen_, typename Lambda, typename IdxType,
+          int TPB>
+void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec,
+                        IdxType D, IdxType N, bool rowMajor,
+                        bool bcastAlongRows, Lambda op, cudaStream_t stream) {
+  IdxType len = N * D;
+  IdxType nblks =
+    raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB);
+  matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
+    <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor,
+                                bcastAlongRows, op);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
+void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D,
+                    IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op,
+                    cudaStream_t stream) {
+  // Use the widest access (16/8/4/2 bytes) that divides the stride and to
+  // which matrix, out and vec are all aligned; otherwise go element-wise.
+  // Pointers are checked against n_bytes, the vector width being selected.
+  size_t stride_bytes = (rowMajor ? D : N) * sizeof(Type);
+  auto test_aligned_access = [stride_bytes, matrix, out, vec](size_t n_bytes) {
+    auto aligned = [n_bytes](const void *p) {
+      return reinterpret_cast<uintptr_t>(p) % n_bytes == 0;
+    };
+    return n_bytes % sizeof(Type) == 0 && stride_bytes % n_bytes == 0 &&
+           aligned(matrix) && aligned(out) && aligned(vec);
+  };
+  if (test_aligned_access(16)) {
+    matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
+      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
+  } else if (test_aligned_access(8)) {
+    matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
+      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
+  } else if (test_aligned_access(4)) {
+    matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
+      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
+  } else if (test_aligned_access(2)) {
+    matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
+      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
+  } else {
+    matrixVectorOpImpl<Type, 1, Lambda, IdxType, TPB>(
+      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
+  }
+}
+
+///@todo: come up with a cleaner interface to support these cases in future!
+
+// out = op(matrix, v1, v2) element-wise, with v1/v2 taken from vector1/vector2
+// at the index selected by rowMajor and bcastAlongRows (see branches below).
+template <typename Type, int veclen_, typename Lambda, typename IdxType>
+__global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
+                                     const Type *vector1, const Type *vector2,
+                                     IdxType D, IdxType N, bool rowMajor,
+                                     bool bcastAlongRows, Lambda op) {
+  typedef TxN_t<Type, veclen_> VecType;
+  // Widen to IdxType before multiplying (as the single-vector kernel does) so
+  // a 64-bit IdxType is not truncated by 32-bit unsigned arithmetic.
+  IdxType idx = threadIdx.x;
+  idx += (IdxType)blockIdx.x * (IdxType)blockDim.x;
+  idx *= VecType::Ratio;
+  if (idx >= N * D) return;
+  VecType mat, vec1, vec2;
+  ///@todo: yikes! use fast-int-div here.
+  ///@todo: shared mem for vector could help with perf
+  const IdxType stride = rowMajor ? D : N;
+  if (rowMajor == bcastAlongRows) {
+    // Vectors are indexed by position within a stride: load a full VecType.
+    const IdxType vIdx = idx % stride;
+    vec1.load(vector1, vIdx);
+    vec2.load(vector2, vIdx);
+  } else {
+    // Vectors are indexed by stride number: broadcast a single entry.
+    const IdxType vIdx = idx / stride;
+    vec1.fill(vector1[vIdx]);
+    vec2.fill(vector2[vIdx]);
+  }
+  mat.load(matrix, idx);
+#pragma unroll
+  for (int i = 0; i < VecType::Ratio; ++i)
+    mat.val.data[i] = op(mat.val.data[i], vec1.val.data[i], vec2.val.data[i]);
+  mat.store(out, idx);
+}
+
+template <typename Type, int veclen_, typename Lambda, typename IdxType,
+          int TPB>
+void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1,
+                        const Type *vec2, IdxType D, IdxType N, bool rowMajor,
+                        bool bcastAlongRows, Lambda op, cudaStream_t stream) {
+  // One thread per veclen_ elements, as in the single-vector launcher.
+  IdxType nblks = raft::ceildiv(veclen_ ? N * D / veclen_ : 0, (IdxType)TPB);
+  matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
+    <<<nblks, TPB, 0, stream>>>(out, matrix, vec1, vec2, D, N, rowMajor,
+                                bcastAlongRows, op);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
+void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1,
+                    const Type *vec2, IdxType D, IdxType N, bool rowMajor,
+                    bool bcastAlongRows, Lambda op, cudaStream_t stream) {
+  // Use the widest access (16/8/4/2 bytes) that divides the stride and to
+  // which matrix, out, vec1 and vec2 are all aligned; otherwise go
+  // element-wise. Pointers are checked against n_bytes, the selected width.
+  size_t stride_bytes = (rowMajor ? D : N) * sizeof(Type);
+  auto test_aligned_access = [=](size_t n_bytes) {
+    auto aligned = [n_bytes](const void *p) {
+      return reinterpret_cast<uintptr_t>(p) % n_bytes == 0;
+    };
+    return n_bytes % sizeof(Type) == 0 && stride_bytes % n_bytes == 0 &&
+           aligned(matrix) && aligned(out) && aligned(vec1) && aligned(vec2);
+  };
+  if (test_aligned_access(16)) {
+    matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
+      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
+  } else if (test_aligned_access(8)) {
+    matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
+      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
+  } else if (test_aligned_access(4)) {
+    matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
+      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
+  } else if (test_aligned_access(2)) {
+    matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
+      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
+  } else {
+    matrixVectorOpImpl<Type, 1, Lambda, IdxType, TPB>(
+      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
+  }
+}
+
+};  // end namespace detail
+};  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh
index 1f1e36f426..c6bb99f8f2 100644
--- a/cpp/include/raft/linalg/detail/qr.cuh
+++ b/cpp/include/raft/linalg/detail/qr.cuh
@@ -14,96 +14,95 @@
  * limitations under the License.
  */
 
- #pragma once
-
- #include <raft/linalg/cublas_wrappers.h>
- #include <raft/linalg/cusolver_wrappers.h>
- #include <raft/matrix/matrix.hpp>
- #include <rmm/device_scalar.hpp>
- #include <rmm/device_uvector.hpp>
- 
- namespace raft {
- namespace linalg {
- namespace detail {
- 
- template <typename math_t>
- void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q,
+#pragma once
+
+#include <raft/linalg/cublas_wrappers.h>
+#include <raft/linalg/cusolver_wrappers.h>
+#include <raft/matrix/matrix.hpp>
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+template <typename math_t>
+void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q,
+            int n_rows, int n_cols, cudaStream_t stream) {
+  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+
+  int m = n_rows, n = n_cols;
+  int k = min(m, n);
+  CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n,
+                             cudaMemcpyDeviceToDevice, stream));
+
+  rmm::device_uvector<math_t> tau(k, stream);
+  CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream));
+
+  rmm::device_scalar<int> devInfo(stream);
+  int Lwork;
+
+  CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork));
+  rmm::device_uvector<math_t> workspace(Lwork, stream);
+  CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(),
+                                 workspace.data(), Lwork, devInfo.data(),
+                                 stream));
+  /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail.
+#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020
+  CUDA_CHECK(cudaDeviceSynchronize());
+#endif
+  CUSOLVER_CHECK(
+    cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork));
+  workspace.resize(Lwork, stream);
+  CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, m, n, k, Q, m, tau.data(),
+                                 workspace.data(), Lwork, devInfo.data(),
+                                 stream));
+}
+
+template <typename math_t>
+void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R,
              int n_rows, int n_cols, cudaStream_t stream) {
-   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
- 
-   int m = n_rows, n = n_cols;
-   int k = min(m, n);
-   CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n,
-                              cudaMemcpyDeviceToDevice, stream));
- 
-   rmm::device_uvector<math_t> tau(k, stream);
-   CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream));
- 
-   rmm::device_scalar<int> devInfo(stream);
-   int Lwork;
- 
-   CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork));
-   rmm::device_uvector<math_t> workspace(Lwork, stream);
-   CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(),
-                                  workspace.data(), Lwork, devInfo.data(),
-                                  stream));
-   /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail.
- #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020
-   CUDA_CHECK(cudaDeviceSynchronize());
- #endif
-   CUSOLVER_CHECK(
-     cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork));
-   workspace.resize(Lwork, stream);
-   CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, m, n, k, Q, m, tau.data(),
-                                  workspace.data(), Lwork, devInfo.data(),
-                                  stream));
- }
- 
- template <typename math_t>
- void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R,
-              int n_rows, int n_cols, cudaStream_t stream) {
-   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
- 
-   int m = n_rows, n = n_cols;
-   rmm::device_uvector<math_t> R_full(m * n, stream);
-   rmm::device_uvector<math_t> tau(min(m, n), stream);
-   CUDA_CHECK(
-     cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream));
-   int R_full_nrows = m, R_full_ncols = n;
-   CUDA_CHECK(cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n,
-                              cudaMemcpyDeviceToDevice, stream));
- 
-   int Lwork;
-   rmm::device_scalar<int> devInfo(stream);
- 
-   CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows,
-                                             R_full_ncols, R_full.data(),
-                                             R_full_nrows, &Lwork));
-   rmm::device_uvector<math_t> workspace(Lwork, stream);
-   CUSOLVER_CHECK(cusolverDngeqrf(
-     cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows,
-     tau.data(), workspace.data(), Lwork, devInfo.data(), stream));
-   // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail.
- #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020
-   CUDA_CHECK(cudaDeviceSynchronize());
- #endif
- 
-   raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream);
- 
-   CUDA_CHECK(cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n,
-                              cudaMemcpyDeviceToDevice, stream));
-   int Q_nrows = m, Q_ncols = n;
- 
-   CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, Q_nrows, Q_ncols,
-                                             min(Q_ncols, Q_nrows), Q, Q_nrows,
-                                             tau.data(), &Lwork));
-   workspace.resize(Lwork, stream);
-   CUSOLVER_CHECK(cusolverDnorgqr(
-     cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(),
-     workspace.data(), Lwork, devInfo.data(), stream));
- }
- 
- };  // namespace detail
- };  // namespace linalg
- };  // namespace raft
- 
\ No newline at end of file
+  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+
+  int m = n_rows, n = n_cols;
+  rmm::device_uvector<math_t> R_full(m * n, stream);
+  rmm::device_uvector<math_t> tau(min(m, n), stream);
+  CUDA_CHECK(
+    cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream));
+  int R_full_nrows = m, R_full_ncols = n;
+  CUDA_CHECK(cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n,
+                             cudaMemcpyDeviceToDevice, stream));
+
+  int Lwork;
+  rmm::device_scalar<int> devInfo(stream);
+
+  CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows,
+                                            R_full_ncols, R_full.data(),
+                                            R_full_nrows, &Lwork));
+  rmm::device_uvector<math_t> workspace(Lwork, stream);
+  CUSOLVER_CHECK(cusolverDngeqrf(
+    cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows,
+    tau.data(), workspace.data(), Lwork, devInfo.data(), stream));
+  // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail.
+#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020
+  CUDA_CHECK(cudaDeviceSynchronize());
+#endif
+
+  raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream);
+
+  CUDA_CHECK(cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n,
+                             cudaMemcpyDeviceToDevice, stream));
+  int Q_nrows = m, Q_ncols = n;
+
+  CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, Q_nrows, Q_ncols,
+                                            min(Q_ncols, Q_nrows), Q, Q_nrows,
+                                            tau.data(), &Lwork));
+  workspace.resize(Lwork, stream);
+  CUSOLVER_CHECK(cusolverDnorgqr(
+    cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(),
+    workspace.data(), Lwork, devInfo.data(), stream));
+}
+
+};  // namespace detail
+};  // namespace linalg
+};  // namespace raft
diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh
index 8de0e6ed5c..2819bde8d2 100644
--- a/cpp/include/raft/linalg/detail/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh
@@ -14,130 +14,129 @@
  * limitations under the License.
  */
 
- #pragma once
+#pragma once
 
- #include <cub/cub.cuh>
- #include <raft/cuda_utils.cuh>
- #include <type_traits>
- #include <raft/linalg/unary_op.hpp>
- 
- namespace raft {
- namespace linalg {
- namespace detail {
+#include <cub/cub.cuh>
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/unary_op.hpp>
+#include <type_traits>
 
- // Kernel to perform reductions along the strided dimension
- // of the matrix, i.e. reduce along columns for row major or reduce along rows
- // for column major layout
- template <typename Type, typename MainLambda>
- __global__ void stridedSummationKernel(Type *dots, const Type *data, int D,
-                                        int N, Type init, MainLambda main_op) {
-   // Thread reduction
-   Type thread_data = Type(init);
-   int colStart = blockIdx.x * blockDim.x + threadIdx.x;
-   if (colStart < D) {
-     int rowStart = blockIdx.y * blockDim.y + threadIdx.y;
-     int stride = blockDim.y * gridDim.y;
-     for (int j = rowStart; j < N; j += stride) {
-       int idx = colStart + j * D;
-       thread_data += main_op(data[idx], j);
-     }
-   }
- 
-   // Block reduction
-   extern __shared__ char tmp[];  // One element per thread in block
-   Type *temp = (Type *)tmp;      // Cast to desired type
-   int myidx = threadIdx.x + blockDim.x * threadIdx.y;
-   temp[myidx] = thread_data;
-   __syncthreads();
-   for (int j = blockDim.y / 2; j > 0; j /= 2) {
-     if (threadIdx.y < j) temp[myidx] += temp[myidx + j * blockDim.x];
-     __syncthreads();
-   }
- 
-   // Grid reduction
-   if ((colStart < D) && (threadIdx.y == 0))
-     raft::myAtomicAdd(dots + colStart, temp[myidx]);
- }
- 
- // Kernel to perform reductions along the strided dimension
- // of the matrix, i.e. reduce along columns for row major or reduce along rows
- // for column major layout
- template <typename InType, typename OutType, typename IdxType,
-           typename MainLambda, typename ReduceLambda>
- __global__ void stridedReductionKernel(OutType *dots, const InType *data, int D,
-                                        int N, OutType init, MainLambda main_op,
-                                        ReduceLambda reduce_op) {
-   // Thread reduction
-   OutType thread_data = init;
-   IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x;
-   if (colStart < D) {
-     IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y;
-     IdxType stride = blockDim.y * gridDim.y;
-     for (IdxType j = rowStart; j < N; j += stride) {
-       IdxType idx = colStart + j * D;
-       thread_data = reduce_op(thread_data, main_op(data[idx], j));
-     }
-   }
- 
-   // Block reduction
-   extern __shared__ char tmp[];  // One element per thread in block
-   auto *temp = (OutType *)tmp;   // Cast to desired type
-   IdxType myidx = threadIdx.x + ((IdxType)blockDim.x * (IdxType)threadIdx.y);
-   temp[myidx] = thread_data;
-   __syncthreads();
-   for (int j = blockDim.y / 2; j > 0; j /= 2) {
-     if (threadIdx.y < j)
-       temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]);
-     __syncthreads();
-   }
- 
-   // Grid reduction
-   if ((colStart < D) && (threadIdx.y == 0))
-     raft::myAtomicReduce(dots + colStart, temp[myidx], reduce_op);
- }
- 
- template <typename InType, typename OutType = InType, typename IdxType = int,
-           typename MainLambda = raft::Nop<InType, IdxType>,
-           typename ReduceLambda = raft::Sum<OutType>,
-           typename FinalLambda = raft::Nop<OutType>>
- void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N,
-                       OutType init, cudaStream_t stream, bool inplace = false,
-                       MainLambda main_op = raft::Nop<InType, IdxType>(),
-                       ReduceLambda reduce_op = raft::Sum<OutType>(),
-                       FinalLambda final_op = raft::Nop<OutType>()) {
-   ///@todo: this extra should go away once we have eliminated the need
-   /// for atomics in stridedKernel (redesign for this is already underway)
-   if (!inplace)
-     raft::linalg::unaryOp(
-       dots, dots, D, [init] __device__(OutType a) { return init; }, stream);
- 
-   // Arbitrary numbers for now, probably need to tune
-   const dim3 thrds(32, 16);
-   IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y);
-   elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread;
-   const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x),
-                    raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread));
-   const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y;
- 
-   ///@todo: this complication should go away once we have eliminated the need
-   /// for atomics in stridedKernel (redesign for this is already underway)
-   if constexpr (std::is_same<ReduceLambda, raft::Sum<OutType>>::value &&
-                 std::is_same<InType, OutType>::value)
-     stridedSummationKernel<InType>
-       <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op);
-   else
-     stridedReductionKernel<InType, OutType, IdxType>
-       <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op,
-                                             reduce_op);
- 
-   ///@todo: this complication should go away once we have eliminated the need
-   /// for atomics in stridedKernel (redesign for this is already underway)
-   // Perform final op on output data
-   if (!std::is_same<FinalLambda, raft::Nop<OutType>>::value)
-     raft::linalg::unaryOp(dots, dots, D, final_op, stream);
- }
- 
- };  // end namespace detail   
- };  // end namespace linalg
- };  // end namespace raft
- 
\ No newline at end of file
+namespace raft {
+namespace linalg {
+namespace detail {
+
+// Sums a matrix along its strided dimension, i.e. along columns for row major
+// or along rows for column major layout. Expects blockDim.x * blockDim.y *
+// sizeof(Type) bytes of dynamic shared memory (one slot per thread).
+template <typename Type, typename MainLambda>
+__global__ void stridedSummationKernel(Type *dots, const Type *data, int D,
+                                       int N, Type init, MainLambda main_op) {
+  // Thread reduction: accumulate this thread's rows of column colStart
+  Type thread_data = Type(init);
+  int colStart = blockIdx.x * blockDim.x + threadIdx.x;
+  if (colStart < D) {
+    int rowStart = blockIdx.y * blockDim.y + threadIdx.y;
+    int stride = blockDim.y * gridDim.y;
+    for (int j = rowStart; j < N; j += stride) {
+      size_t idx = (size_t)j * D + colStart;  // j * D may not fit in an int
+      thread_data += main_op(data[idx], j);
+    }
+  }
+
+  // Block reduction: fold the partial sums along threadIdx.y
+  extern __shared__ char tmp[];  // One element per thread in block
+  Type *temp = (Type *)tmp;      // Cast to desired type
+  int myidx = threadIdx.x + blockDim.x * threadIdx.y;
+  temp[myidx] = thread_data;
+  __syncthreads();
+  for (int j = blockDim.y / 2; j > 0; j /= 2) {
+    if (threadIdx.y < j) temp[myidx] += temp[myidx + j * blockDim.x];
+    __syncthreads();
+  }
+
+  // Grid reduction: threadIdx.y == 0 adds the block total into dots[colStart]
+  if ((colStart < D) && (threadIdx.y == 0))
+    raft::myAtomicAdd(dots + colStart, temp[myidx]);
+}
+
+// Generic variant of the strided-dimension reduction (along columns for row
+// major, along rows for column major layout): partial results are combined
+// with reduce_op and the block result is merged through raft::myAtomicReduce.
+template <typename InType, typename OutType, typename IdxType,
+          typename MainLambda, typename ReduceLambda>
+__global__ void stridedReductionKernel(OutType *dots, const InType *data, int D,
+                                       int N, OutType init, MainLambda main_op,
+                                       ReduceLambda reduce_op) {
+  // Stage 1: every thread folds the rows it visits in its own column
+  OutType acc = init;
+  const IdxType col = blockIdx.x * blockDim.x + threadIdx.x;
+  if (col < D) {
+    const IdxType rowStep = blockDim.y * gridDim.y;
+    for (IdxType row = blockIdx.y * blockDim.y + threadIdx.y; row < N;
+         row += rowStep) {
+      const IdxType offset = col + row * D;
+      acc = reduce_op(acc, main_op(data[offset], row));
+    }
+  }
+
+  // Stage 2: combine the block's partial results along threadIdx.y
+  extern __shared__ char tmp[];  // one OutType slot per thread in the block
+  OutType *buf = reinterpret_cast<OutType *>(tmp);
+  IdxType slot = threadIdx.x + ((IdxType)blockDim.x * (IdxType)threadIdx.y);
+  buf[slot] = acc;
+  __syncthreads();
+  for (int half = blockDim.y / 2; half > 0; half /= 2) {
+    if (threadIdx.y < half)
+      buf[slot] = reduce_op(buf[slot], buf[slot + half * blockDim.x]);
+    __syncthreads();
+  }
+
+  // Stage 3: the threadIdx.y == 0 threads merge the block result into dots
+  if ((col < D) && (threadIdx.y == 0))
+    raft::myAtomicReduce(dots + col, buf[slot], reduce_op);
+}
+
+// Reduces data (N rows of D contiguous elements) along the strided dimension
+// into dots[0..D). Unless inplace is set, dots is first filled with init;
+// final_op is applied to dots last unless it is the default raft::Nop.
+template <typename InType, typename OutType = InType, typename IdxType = int,
+          typename MainLambda = raft::Nop<InType, IdxType>,
+          typename ReduceLambda = raft::Sum<OutType>,
+          typename FinalLambda = raft::Nop<OutType>>
+void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N,
+                      OutType init, cudaStream_t stream, bool inplace = false,
+                      MainLambda main_op = raft::Nop<InType, IdxType>(),
+                      ReduceLambda reduce_op = raft::Sum<OutType>(),
+                      FinalLambda final_op = raft::Nop<OutType>()) {
+  ///@todo: init/final passes go away once stridedKernel stops using atomics
+  if (!inplace)
+    raft::linalg::unaryOp(
+      dots, dots, D, [init] __device__(OutType a) { return init; }, stream);
+  // Skip the launch on empty input: N == 0 would make elemsPerThread zero
+  // and the grid-size ceildiv below would then divide by zero.
+  if (D > 0 && N > 0) {
+    // Arbitrary numbers for now, probably need to tune
+    const dim3 thrds(32, 16);
+    IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y);
+    elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread;
+    const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x),
+                     raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread));
+    const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y;
+    if constexpr (std::is_same<ReduceLambda, raft::Sum<OutType>>::value &&
+                  std::is_same<InType, OutType>::value)
+      stridedSummationKernel<InType>
+        <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op);
+    else
+      stridedReductionKernel<InType, OutType, IdxType>
+        <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op,
+                                              reduce_op);
+    CUDA_CHECK(cudaPeekAtLastError());  // report launch-configuration errors
+  }
+  // Perform final op on output data (compiled out for the default Nop)
+  if constexpr (!std::is_same<FinalLambda, raft::Nop<OutType>>::value)
+    raft::linalg::unaryOp(dots, dots, D, final_op, stream);
+}
+
+};  // end namespace detail
+};  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh
index e6faa883de..a58888a24f 100644
--- a/cpp/include/raft/linalg/detail/subtract.cuh
+++ b/cpp/include/raft/linalg/detail/subtract.cuh
@@ -14,40 +14,39 @@
  * limitations under the License.
  */
 
- #pragma once
+#pragma once
 
- #include <raft/cuda_utils.cuh>
- #include <raft/linalg/binary_op.hpp>
- #include <raft/linalg/unary_op.hpp>
- 
- namespace raft {
- namespace linalg {
- namespace detail {
- 
- template <class math_t, typename IdxType>
- __global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
-                                            const math_t *singleScalarDev,
-                                            IdxType len) {
-   //TODO: kernel do not use shared memory in current implementation
-   int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
-   if (i < len) {
-     outDev[i] = inDev[i] - *singleScalarDev;
-   }
- }
- 
- template <typename math_t, typename IdxType = int, int TPB = 256>
- void subtractDevScalar(math_t *outDev, const math_t *inDev,
-                        const math_t *singleScalarDev, IdxType len,
-                        cudaStream_t stream) {
-   // Just for the note - there is no way to express such operation with cuBLAS in effective way
-   // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda
-   const IdxType nblks = raft::ceildiv(len, (IdxType)TPB);
-   subtract_dev_scalar_kernel<math_t>
-     <<<nblks, TPB, 0, stream>>>(outDev, inDev, singleScalarDev, len);
-   CUDA_CHECK(cudaPeekAtLastError());
- }
- 
- };  // end namespace detail
- };  // end namespace linalg
- };  // end namespace raft
- 
\ No newline at end of file
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/binary_op.hpp>
+#include <raft/linalg/unary_op.hpp>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+// outDev[i] = inDev[i] - *singleScalarDev for every i < len; the scalar is
+// dereferenced inside the kernel. One element per thread, 1-D launch.
+template <class math_t, typename IdxType>
+__global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
+                                           const math_t *singleScalarDev,
+                                           IdxType len) {
+  // Keep the index in IdxType: an int would truncate when IdxType is 64-bit
+  IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
+  if (i < len) outDev[i] = inDev[i] - *singleScalarDev;
+}
+
+// outDev[i] = inDev[i] - *singleScalarDev for i < len; no efficient cuBLAS form
+template <typename math_t, typename IdxType = int, int TPB = 256>
+void subtractDevScalar(math_t *outDev, const math_t *inDev,
+                       const math_t *singleScalarDev, IdxType len,
+                       cudaStream_t stream) {
+  if (len <= 0) return;  // a zero-block grid is an invalid launch configuration
+  const IdxType nblks = raft::ceildiv(len, (IdxType)TPB);
+  subtract_dev_scalar_kernel<math_t>
+    <<<nblks, TPB, 0, stream>>>(outDev, inDev, singleScalarDev, len);
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+};  // end namespace detail
+};  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/svd.cuh b/cpp/include/raft/linalg/detail/svd.cuh
index 5e2ace1ad2..60ac47c501 100644
--- a/cpp/include/raft/linalg/detail/svd.cuh
+++ b/cpp/include/raft/linalg/detail/svd.cuh
@@ -14,84 +14,84 @@
  * limitations under the License.
  */
 
- #pragma once
-
- #include <raft/cudart_utils.h>
- #include <raft/linalg/cublas_wrappers.h>
- #include <raft/linalg/cusolver_wrappers.h>
- #include <raft/cuda_utils.cuh>
- #include <raft/handle.hpp>
- #include <raft/matrix/math.hpp>
- #include <raft/matrix/matrix.hpp>
- #include <rmm/device_scalar.hpp>
- #include <rmm/device_uvector.hpp>
- #include <raft/linalg/eig.hpp>
- #include <raft/linalg/gemm.hpp>
- #include <raft/linalg/transpose.h>
- 
- namespace raft {
- namespace linalg {
- namespace detail {
- 
- template <typename T>
- void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
-            T *sing_vals, T *left_sing_vecs, T *right_sing_vecs,
-            bool trans_right, bool gen_left_vec, bool gen_right_vec,
-            cudaStream_t stream) {
-   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-   cublasHandle_t cublasH = handle.get_cublas_handle();
- 
- #if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000
-   // 46340: sqrt of max int value
-   ASSERT(n_rows <= 46340,
-          "svd solver is not supported for the data that has more than 46340 "
-          "samples (rows) "
-          "if you are using CUDA version <11. Please use other solvers such as "
-          "eig if it is available.");
- #endif
- 
-   const int m = n_rows;
-   const int n = n_cols;
- 
-   rmm::device_scalar<int> devInfo(stream);
-   T *d_rwork = nullptr;
- 
-   int lwork = 0;
-   CUSOLVER_CHECK(
-     cusolverDngesvd_bufferSize<T>(cusolverH, n_rows, n_cols, &lwork));
-   rmm::device_uvector<T> d_work(lwork, stream);
- 
-   char jobu = 'S';
-   char jobvt = 'A';
- 
-   if (!gen_left_vec) {
-     char new_u = 'N';
-     strcpy(&jobu, &new_u);
-   }
- 
-   if (!gen_right_vec) {
-     char new_vt = 'N';
-     strcpy(&jobvt, &new_vt);
-   }
- 
-   CUSOLVER_CHECK(cusolverDngesvd(
-     cusolverH, jobu, jobvt, m, n, in, m, sing_vals, left_sing_vecs, m,
-     right_sing_vecs, n, d_work.data(), lwork, d_rwork, devInfo.data(), stream));
- 
-   // Transpose the right singular vector back
-   if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream);
- 
-   CUDA_CHECK(cudaGetLastError());
- 
-   int dev_info;
-   raft::update_host(&dev_info, devInfo.data(), 1, stream);
-   CUDA_CHECK(cudaStreamSynchronize(stream));
-   ASSERT(dev_info == 0,
-          "svd.cuh: svd couldn't converge to a solution. "
-          "This usually occurs when some of the features do not vary enough.");
- }
-
- template <typename T>
+#pragma once
+
+#include <raft/cudart_utils.h>
+#include <raft/linalg/cublas_wrappers.h>
+#include <raft/linalg/cusolver_wrappers.h>
+#include <raft/linalg/transpose.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/handle.hpp>
+#include <raft/linalg/eig.hpp>
+#include <raft/linalg/gemm.hpp>
+#include <raft/matrix/math.hpp>
+#include <raft/matrix/matrix.hpp>
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+// SVD of the n_rows x n_cols matrix `in` through cusolverDngesvd.
+// `in` is handed to the solver with leading dimension n_rows. Singular values
+// go to sing_vals; left / right singular vectors are only requested when
+// gen_left_vec / gen_right_vec is set. With trans_right,
+// raft::linalg::transpose is applied to right_sing_vecs afterwards.
+// Synchronizes `stream` to read back devInfo and asserts that it is zero.
+template <typename T>
+void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
+           T *sing_vals, T *left_sing_vecs, T *right_sing_vecs,
+           bool trans_right, bool gen_left_vec, bool gen_right_vec,
+           cudaStream_t stream) {
+  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+
+#if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000
+  // 46340: sqrt of max int value
+  ASSERT(n_rows <= 46340,
+         "svd solver is not supported for the data that has more than 46340 "
+         "samples (rows) "
+         "if you are using CUDA version <11. Please use other solvers such as "
+         "eig if it is available.");
+#endif
+
+  const int m = n_rows;
+  const int n = n_cols;
+
+  rmm::device_scalar<int> devInfo(stream);
+  T *d_rwork = nullptr;
+
+  // Query the workspace size, then allocate it on the stream
+  int lwork = 0;
+  CUSOLVER_CHECK(
+    cusolverDngesvd_bufferSize<T>(cusolverH, n_rows, n_cols, &lwork));
+  rmm::device_uvector<T> d_work(lwork, stream);
+
+  // Job codes are single chars, so assign them directly: strcpy between two
+  // lone chars keeps copying until it meets a NUL and overruns both objects.
+  // 'N' asks gesvd to skip that set of singular vectors.
+  char jobu = gen_left_vec ? 'S' : 'N';
+  char jobvt = gen_right_vec ? 'A' : 'N';
+
+  CUSOLVER_CHECK(cusolverDngesvd(
+    cusolverH, jobu, jobvt, m, n, in, m, sing_vals, left_sing_vecs, m,
+    right_sing_vecs, n, d_work.data(), lwork, d_rwork, devInfo.data(), stream));
+
+  // Transpose the right singular vector back
+  if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream);
+
+  CUDA_CHECK(cudaGetLastError());
+
+  // Bring the solver status to the host; the sync makes dev_info valid
+  int dev_info;
+  raft::update_host(&dev_info, devInfo.data(), 1, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  ASSERT(dev_info == 0,
+         "svd.cuh: svd couldn't converge to a solution. "
+         "This usually occurs when some of the features do not vary enough.");
+}
+
+template <typename T>
 void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S,
             T *U, T *V, bool gen_left_vec, cudaStream_t stream) {
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
@@ -106,7 +106,8 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S,
                      n_cols, n_cols, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta,
                      stream);
 
-  raft::linalg::eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream);
+  raft::linalg::eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S,
+                      stream);
 
   raft::matrix::colReverse(V, n_cols, n_cols, stream);
   raft::matrix::rowReverse(S, n_cols, 1, stream);
@@ -121,82 +122,81 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S,
   }
 }
 
- template <typename math_t>
- void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
-                math_t *sing_vals, math_t *left_sing_vecs,
-                math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec,
-                math_t tol, int max_sweeps, cudaStream_t stream) {
-   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
- 
-   gesvdjInfo_t gesvdj_params = NULL;
- 
-   CUSOLVER_CHECK(cusolverDnCreateGesvdjInfo(&gesvdj_params));
-   CUSOLVER_CHECK(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));
-   CUSOLVER_CHECK(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, max_sweeps));
- 
-   int m = n_rows;
-   int n = n_cols;
- 
-   rmm::device_scalar<int> devInfo(stream);
- 
-   int lwork = 0;
-   int econ = 1;
- 
-   CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize(
-     cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals,
-     left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params));
- 
-   rmm::device_uvector<math_t> d_work(lwork, stream);
- 
-   CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(
-     cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals,
-     left_sing_vecs, m, right_sing_vecs, n, d_work.data(), lwork, devInfo.data(),
-     gesvdj_params, stream));
- 
-   CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params));
- }
- 
- template <typename math_t>
- bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U,
-                          math_t *S_vec, math_t *V, int n_rows, int n_cols,
-                          int k, math_t tol, cudaStream_t stream) {
-   cublasHandle_t cublasH = handle.get_cublas_handle();
- 
-   int m = n_rows, n = n_cols;
- 
-   // form product matrix
-   rmm::device_uvector<math_t> P_d(m * n, stream);
-   rmm::device_uvector<math_t> S_mat(k * k, stream);
-   CUDA_CHECK(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream));
-   CUDA_CHECK(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream));
- 
-   raft::matrix::initializeDiagonalMatrix(S_vec, S_mat.data(), k, k, stream);
-   svdReconstruction(handle, U, S_mat.data(), V, P_d.data(), m, n, k, stream);
- 
-   // get norms of each
-   math_t normA = raft::matrix::getL2Norm(handle, A_d, m * n, stream);
-   math_t normU = raft::matrix::getL2Norm(handle, U, m * k, stream);
-   math_t normS = raft::matrix::getL2Norm(handle, S_mat.data(), k * k, stream);
-   math_t normV = raft::matrix::getL2Norm(handle, V, n * k, stream);
-   math_t normP = raft::matrix::getL2Norm(handle, P_d.data(), m * n, stream);
- 
-   // calculate percent error
-   const math_t alpha = 1.0, beta = -1.0;
-   rmm::device_uvector<math_t> A_minus_P(m * n, stream);
-   CUDA_CHECK(
-     cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream));
- 
-   CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n,
-                                         &alpha, A_d, m, &beta, P_d.data(), m,
-                                         A_minus_P.data(), m, stream));
- 
-   math_t norm_A_minus_P =
-     raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream);
-   math_t percent_error = 100.0 * norm_A_minus_P / normA;
-   return (percent_error / 100.0 < tol);
- }
- 
- };  // end namespace detail
- };  // end namespace linalg
- };  // end namespace raft
- 
\ No newline at end of file
+// SVD of the n_rows x n_cols matrix `in` through cusolverDngesvdj, always run
+// with CUSOLVER_EIG_MODE_VECTOR and econ = 1; tol / max_sweeps set the solver.
+// NOTE(review): gen_left_vec / gen_right_vec are accepted but not consulted.
+template <typename math_t>
+void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
+               math_t *sing_vals, math_t *left_sing_vecs,
+               math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec,
+               math_t tol, int max_sweeps, cudaStream_t stream) {
+  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+  const int m = n_rows, n = n_cols, econ = 1;
+
+  gesvdjInfo_t gesvdj_params = NULL;
+  CUSOLVER_CHECK(cusolverDnCreateGesvdjInfo(&gesvdj_params));
+  try {  // the info object must be destroyed even when a step below throws
+    CUSOLVER_CHECK(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));
+    CUSOLVER_CHECK(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, max_sweeps));
+
+    rmm::device_scalar<int> devInfo(stream);
+    int lwork = 0;
+    CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize(
+      cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals,
+      left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params));
+
+    rmm::device_uvector<math_t> d_work(lwork, stream);
+    CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(
+      cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals,
+      left_sing_vecs, m, right_sing_vecs, n, d_work.data(), lwork,
+      devInfo.data(), gesvdj_params, stream));
+  } catch (...) {
+    cusolverDnDestroyGesvdjInfo(gesvdj_params);  // best effort; keep the error
+    throw;
+  }
+  CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params));
+}
+
+// Checks an SVD by reconstruction: builds P with svdReconstruction from U, V
+// and a k x k matrix filled from S_vec by initializeDiagonalMatrix, then
+// returns true when getL2Norm(A - P) / getL2Norm(A) is below tol.
+template <typename math_t>
+bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U,
+                         math_t *S_vec, math_t *V, int n_rows, int n_cols,
+                         int k, math_t tol, cudaStream_t stream) {
+  cublasHandle_t cublasH = handle.get_cublas_handle();
+
+  int m = n_rows, n = n_cols;
+
+  // form product matrix
+  rmm::device_uvector<math_t> P_d(m * n, stream);
+  rmm::device_uvector<math_t> S_mat(k * k, stream);
+  CUDA_CHECK(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream));
+  CUDA_CHECK(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream));
+
+  raft::matrix::initializeDiagonalMatrix(S_vec, S_mat.data(), k, k, stream);
+  svdReconstruction(handle, U, S_mat.data(), V, P_d.data(), m, n, k, stream);
+
+  // Only the norm of A enters the error ratio below; the norms of U, S, V
+  // and P are never read, so they are not computed.
+  math_t normA = raft::matrix::getL2Norm(handle, A_d, m * n, stream);
+
+  // calculate percent error: A_minus_P = 1 * A + (-1) * P
+  const math_t alpha = 1.0, beta = -1.0;
+  rmm::device_uvector<math_t> A_minus_P(m * n, stream);
+  CUDA_CHECK(
+    cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream));
+
+  CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n,
+                                        &alpha, A_d, m, &beta, P_d.data(), m,
+                                        A_minus_P.data(), m, stream));
+
+  math_t norm_A_minus_P =
+    raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream);
+  math_t percent_error = 100.0 * norm_A_minus_P / normA;
+  return (percent_error / 100.0 < tol);
+}
+
+};  // end namespace detail
+};  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh
index 8502dc26f4..190205fea0 100644
--- a/cpp/include/raft/linalg/detail/unary_op.cuh
+++ b/cpp/include/raft/linalg/detail/unary_op.cuh
@@ -14,101 +14,100 @@
  * limitations under the License.
  */
 
- #pragma once
+#pragma once
 
- #include <raft/cudart_utils.h>
- #include <raft/cuda_utils.cuh>
- #include <raft/vectorized.cuh>
- 
- namespace raft {
- namespace linalg {
- namespace detail {
- 
- template <typename InType, int VecLen, typename Lambda, typename OutType,
-           typename IdxType>
- __global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len,
-                               Lambda op) {
-   typedef TxN_t<InType, VecLen> InVecType;
-   typedef TxN_t<OutType, VecLen> OutVecType;
-   InVecType a;
-   OutVecType b;
-   IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x);
-   idx *= InVecType::Ratio;
-   if (idx >= len) return;
-   a.load(in, idx);
- #pragma unroll
-   for (int i = 0; i < InVecType::Ratio; ++i) {
-     b.val.data[i] = op(a.val.data[i]);
-   }
-   b.store(out, idx);
- }
- 
- template <typename InType, int VecLen, typename Lambda, typename OutType,
-           typename IdxType, int TPB>
- void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op,
-                  cudaStream_t stream) {
-   const IdxType nblks =
-     raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
-   unaryOpKernel<InType, VecLen, Lambda, OutType, IdxType>
-     <<<nblks, TPB, 0, stream>>>(out, in, len, op);
-   CUDA_CHECK(cudaPeekAtLastError());
- }
- 
- template <typename InType, typename Lambda, typename IdxType = int,
-           typename OutType = InType, int TPB = 256>
- void unaryOpCaller(OutType *out, const InType *in, IdxType len, Lambda op,
-              cudaStream_t stream) {
-   if (len <= 0) return;  //silently skip in case of 0 length input
-   constexpr auto maxSize =
-     sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
-   size_t bytes = len * maxSize;
-   uint64_t inAddr = uint64_t(in);
-   uint64_t outAddr = uint64_t(out);
-   if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 &&
-       outAddr % 16 == 0) {
-     unaryOpImpl<InType, 16 / maxSize, Lambda, OutType, IdxType, TPB>(
-       out, in, len, op, stream);
-   } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 &&
-              outAddr % 8 == 0) {
-     unaryOpImpl<InType, 8 / maxSize, Lambda, OutType, IdxType, TPB>(
-       out, in, len, op, stream);
-   } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 &&
-              outAddr % 4 == 0) {
-     unaryOpImpl<InType, 4 / maxSize, Lambda, OutType, IdxType, TPB>(
-       out, in, len, op, stream);
-   } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 &&
-              outAddr % 2 == 0) {
-     unaryOpImpl<InType, 2 / maxSize, Lambda, OutType, IdxType, TPB>(
-       out, in, len, op, stream);
-   } else if (1 / maxSize) {
-     unaryOpImpl<InType, 1 / maxSize, Lambda, OutType, IdxType, TPB>(
-       out, in, len, op, stream);
-   } else {
-     unaryOpImpl<InType, 1, Lambda, OutType, IdxType, TPB>(out, in, len, op,
-                                                           stream);
-   }
- }
- 
- template <typename OutType, typename Lambda, typename IdxType>
- __global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) {
-   IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x);
-   if (idx < len) {
-     op(out + idx, idx);
-   }
- }
- 
- template <typename OutType, typename Lambda, typename IdxType = int,
-           int TPB = 256>
- void writeOnlyUnaryOpCaller(OutType *out, IdxType len, Lambda op,
-                       cudaStream_t stream) {
-   if (len <= 0) return;  // silently skip in case of 0 length input
-   auto nblks = raft::ceildiv<IdxType>(len, TPB);
-   writeOnlyUnaryOpKernel<OutType, Lambda, IdxType>
-     <<<nblks, TPB, 0, stream>>>(out, len, op);
-   CUDA_CHECK(cudaGetLastError());
- }
- 
- };  // end namespace detail
- };  // end namespace linalg
- };  // end namespace raft
- 
\ No newline at end of file
+#include <raft/cudart_utils.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/vectorized.cuh>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+// Element-wise map: each thread loads one TxN_t<InType, VecLen> vector from
+// `in`, passes each of its Ratio elements through op and stores to `out`.
+template <typename InType, int VecLen, typename Lambda, typename OutType,
+          typename IdxType>
+__global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len,
+                              Lambda op) {
+  using SrcVec = TxN_t<InType, VecLen>;
+  using DstVec = TxN_t<OutType, VecLen>;
+  SrcVec src;
+  DstVec dst;
+  IdxType first = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x);
+  first *= SrcVec::Ratio;
+  if (first >= len) return;
+  src.load(in, first);
+#pragma unroll
+  for (int k = 0; k < SrcVec::Ratio; ++k) dst.val.data[k] = op(src.val.data[k]);
+  dst.store(out, first);
+}
+
+template <typename InType, int VecLen, typename Lambda, typename OutType,
+          typename IdxType, int TPB>
+void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op,
+                 cudaStream_t stream) {  // host launcher for unaryOpKernel
+  IdxType nblks = VecLen ? len / VecLen : len;  // one thread per vector
+  nblks = raft::ceildiv(nblks, (IdxType)TPB);   // TPB threads per block
+  unaryOpKernel<InType, VecLen, Lambda, OutType, IdxType>
+    <<<nblks, TPB, 0, stream>>>(out, in, len, op);
+  CUDA_CHECK(cudaPeekAtLastError());  // report launch errors
+}
+
+// Picks the widest of 16/8/4/2/1 bytes that divides the total byte count and
+// both addresses, then calls the matching unaryOpImpl. No-op when len <= 0.
+template <typename InType, typename Lambda, typename IdxType = int,
+          typename OutType = InType, int TPB = 256>
+void unaryOpCaller(OutType *out, const InType *in, IdxType len, Lambda op,
+                   cudaStream_t stream) {
+  if (len <= 0) return;  // nothing to do for an empty input
+  constexpr auto maxSize =
+    sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
+  const size_t bytes = len * maxSize;
+  const uint64_t inAddr = uint64_t(in), outAddr = uint64_t(out);
+  auto fits = [bytes, inAddr, outAddr](size_t width) {
+    return bytes % width == 0 && inAddr % width == 0 && outAddr % width == 0;
+  };
+  if (16 / maxSize && fits(16)) {
+    unaryOpImpl<InType, 16 / maxSize, Lambda, OutType, IdxType, TPB>(
+      out, in, len, op, stream);
+  } else if (8 / maxSize && fits(8)) {
+    unaryOpImpl<InType, 8 / maxSize, Lambda, OutType, IdxType, TPB>(
+      out, in, len, op, stream);
+  } else if (4 / maxSize && fits(4)) {
+    unaryOpImpl<InType, 4 / maxSize, Lambda, OutType, IdxType, TPB>(
+      out, in, len, op, stream);
+  } else if (2 / maxSize && fits(2)) {
+    unaryOpImpl<InType, 2 / maxSize, Lambda, OutType, IdxType, TPB>(
+      out, in, len, op, stream);
+  } else if (1 / maxSize) {
+    unaryOpImpl<InType, 1 / maxSize, Lambda, OutType, IdxType, TPB>(
+      out, in, len, op, stream);
+  } else {
+    unaryOpImpl<InType, 1, Lambda, OutType, IdxType, TPB>(out, in, len, op,
+                                                          stream);
+  }
+}
+
+// One thread per element: thread pos calls op(out + pos, pos) when pos < len.
+template <typename OutType, typename Lambda, typename IdxType>
+__global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) {
+  IdxType pos = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x);
+  if (pos >= len) return;
+  op(out + pos, pos);
+}
+
+// Launches writeOnlyUnaryOpKernel over len elements; returns when len <= 0.
+template <typename OutType, typename Lambda, typename IdxType = int,
+          int TPB = 256>
+void writeOnlyUnaryOpCaller(OutType *out, IdxType len, Lambda op,
+                            cudaStream_t stream) {
+  if (len <= 0) return;  // an empty range needs no launch
+  writeOnlyUnaryOpKernel<OutType, Lambda, IdxType>
+    <<<raft::ceildiv<IdxType>(len, TPB), TPB, 0, stream>>>(out, len, op);
+  CUDA_CHECK(cudaGetLastError());
+}
+
+};  // end namespace detail
+};  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/divide.hpp b/cpp/include/raft/linalg/divide.hpp
index 56d01be990..e4eead777c 100644
--- a/cpp/include/raft/linalg/divide.hpp
+++ b/cpp/include/raft/linalg/divide.hpp
@@ -38,9 +38,7 @@ using detail::divides_scalar;
 template <typename math_t, typename IdxType = int>
 void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len,
                   cudaStream_t stream) {
-  unaryOp(
-    out, in, len, divides_scalar<math_t>(scalar),
-    stream);
+  unaryOp(out, in, len, divides_scalar<math_t>(scalar), stream);
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp
index 2659f3d6b8..288b43f27f 100644
--- a/cpp/include/raft/linalg/eig.hpp
+++ b/cpp/include/raft/linalg/eig.hpp
@@ -41,9 +41,9 @@ void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows,
   detail::eigDC(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream);
 }
 
+using detail::COPY_INPUT;
 using detail::EigVecMemUsage;
 using detail::OVERWRITE_INPUT;
-using detail::COPY_INPUT;
 
 #if CUDART_VERSION >= 10010
 
@@ -65,7 +65,8 @@ template <typename math_t>
 void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
               int n_eig_vals, math_t *eig_vectors, math_t *eig_vals,
               EigVecMemUsage memUsage, cudaStream_t stream) {
-  detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, eig_vals, memUsage, stream);
+  detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors,
+                   eig_vals, memUsage, stream);
 }
 
 #endif
@@ -88,7 +89,8 @@ template <typename math_t>
 void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows,
                int n_cols, math_t *eig_vectors, math_t *eig_vals,
                cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) {
-  detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps);
+  detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream,
+                    tol, sweeps);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/eltwise.hpp b/cpp/include/raft/linalg/eltwise.hpp
index 90bf608e11..63b824e6f7 100644
--- a/cpp/include/raft/linalg/eltwise.hpp
+++ b/cpp/include/raft/linalg/eltwise.hpp
@@ -40,9 +40,8 @@ using detail::adds_scalar;
 template <typename InType, typename IdxType, typename OutType = InType>
 void scalarAdd(OutType *out, const InType *in, InType scalar, IdxType len,
                cudaStream_t stream) {
-  raft::linalg::unaryOp(
-    out, in, len, adds_scalar<InType, OutType>(scalar),
-    stream);
+  raft::linalg::unaryOp(out, in, len, adds_scalar<InType, OutType>(scalar),
+                        stream);
 }
 
 using detail::multiplies_scalar;
@@ -50,9 +49,8 @@ using detail::multiplies_scalar;
 template <typename InType, typename IdxType, typename OutType = InType>
 void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len,
                     cudaStream_t stream) {
-  raft::linalg::unaryOp(
-    out, in, len, multiplies_scalar<InType, OutType>(scalar),
-    stream);
+  raft::linalg::unaryOp(out, in, len,
+                        multiplies_scalar<InType, OutType>(scalar), stream);
 }
 /** @} */
 
@@ -70,33 +68,25 @@ void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len,
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseAdd(OutType *out, const InType *in1, const InType *in2, IdxType len,
                 cudaStream_t stream) {
-  binaryOp(
-    out, in1, in2, len, thrust::plus<InType>(),
-    stream);
+  binaryOp(out, in1, in2, len, thrust::plus<InType>(), stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseSub(OutType *out, const InType *in1, const InType *in2, IdxType len,
                 cudaStream_t stream) {
-  binaryOp(
-    out, in1, in2, len, thrust::minus<InType>(),
-    stream);
+  binaryOp(out, in1, in2, len, thrust::minus<InType>(), stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseMultiply(OutType *out, const InType *in1, const InType *in2,
                      IdxType len, cudaStream_t stream) {
-  binaryOp(
-    out, in1, in2, len, thrust::multiplies<InType>(),
-    stream);
+  binaryOp(out, in1, in2, len, thrust::multiplies<InType>(), stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseDivide(OutType *out, const InType *in1, const InType *in2,
                    IdxType len, cudaStream_t stream) {
-  binaryOp(
-    out, in1, in2, len, thrust::divides<InType>(),
-    stream);
+  binaryOp(out, in1, in2, len, thrust::divides<InType>(), stream);
 }
 
 using detail::divides_check_zero;
@@ -104,10 +94,7 @@ using detail::divides_check_zero;
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseDivideCheckZero(OutType *out, const InType *in1, const InType *in2,
                             IdxType len, cudaStream_t stream) {
-  binaryOp(
-    out, in1, in2, len,
-    divides_check_zero<InType, OutType>(),
-    stream);
+  binaryOp(out, in1, in2, len, divides_check_zero<InType, OutType>(), stream);
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp
index 3e8ac5b768..9326714a41 100644
--- a/cpp/include/raft/linalg/gemm.hpp
+++ b/cpp/include/raft/linalg/gemm.hpp
@@ -44,7 +44,8 @@ void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
           int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c,
           cublasOperation_t trans_a, cublasOperation_t trans_b, math_t alpha,
           math_t beta, cudaStream_t stream) {
-  detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream);
+  detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a,
+               trans_b, alpha, beta, stream);
 }
 
 template <typename math_t>
@@ -81,7 +82,8 @@ template <typename T>
 void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
           int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor,
           cudaStream_t stream, T alpha = T(1.0), T beta = T(0.0)) {
-  detail::gemm(handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, alpha, beta);
+  detail::gemm(handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor,
+               isYColMajor, stream, alpha, beta);
 }
 
 }  // end namespace linalg
diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp
index 2fa8d1dd0d..28c4ff8238 100644
--- a/cpp/include/raft/linalg/lanczos.hpp
+++ b/cpp/include/raft/linalg/lanczos.hpp
@@ -83,8 +83,10 @@ int computeSmallestEigenvectors(
   value_type_t *__restrict__ lanczosVecs_dev,
   value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev,
   value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) {
-  return raft::detail::computeSmallestEigenvectors(handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, effIter, totalIter, shift,
-    alpha_host, beta_host, lanczosVecs_dev, work_dev, eigVals_dev, eigVecs_dev, seed);
+  return raft::detail::computeSmallestEigenvectors(
+    handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, effIter,
+    totalIter, shift, alpha_host, beta_host, lanczosVecs_dev, work_dev,
+    eigVals_dev, eigVecs_dev, seed);
 }
 
 /**  
@@ -131,7 +133,9 @@ int computeSmallestEigenvectors(
   value_type_t tol, bool reorthogonalize, index_type_t &iter,
   value_type_t *__restrict__ eigVals_dev,
   value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) {
-  return raft::detail::computeSmallestEigenvectors(handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, iter, eigVals_dev, eigVecs_dev, seed);
+  return raft::detail::computeSmallestEigenvectors(
+    handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, iter,
+    eigVals_dev, eigVecs_dev, seed);
 }
 
 // =========================================================
@@ -192,8 +196,10 @@ int computeLargestEigenvectors(
   value_type_t *__restrict__ lanczosVecs_dev,
   value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev,
   value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) {
-  return raft::detail::computeLargestEigenvectors(handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, effIter, totalIter, alpha_host, beta_host,
-    lanczosVecs_dev, work_dev, eigVals_dev, eigVecs_dev, seed);
+  return raft::detail::computeLargestEigenvectors(
+    handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, effIter,
+    totalIter, alpha_host, beta_host, lanczosVecs_dev, work_dev, eigVals_dev,
+    eigVecs_dev, seed);
 }
 
 /**  
@@ -240,7 +246,9 @@ int computeLargestEigenvectors(
   value_type_t tol, bool reorthogonalize, index_type_t &iter,
   value_type_t *__restrict__ eigVals_dev,
   value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) {
-  return raft::detail::computeLargestEigenvectors(handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, iter, eigVals_dev, eigVecs_dev, seed);
+  return raft::detail::computeLargestEigenvectors(
+    handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, iter,
+    eigVals_dev, eigVecs_dev, seed);
 }
 
 }  // namespace raft
diff --git a/cpp/include/raft/linalg/map.hpp b/cpp/include/raft/linalg/map.hpp
index 0c9a2d0b10..40e6253af9 100644
--- a/cpp/include/raft/linalg/map.hpp
+++ b/cpp/include/raft/linalg/map.hpp
@@ -40,8 +40,8 @@ template <typename InType, typename MapOp, int TPB = 256, typename... Args,
           typename OutType = InType>
 void map(OutType *out, size_t len, MapOp map, cudaStream_t stream,
          const InType *in, Args... args) {
-  detail::mapImpl<InType, OutType, MapOp, TPB, Args...>(out, len, map, stream, in,
-                                                args...);
+  detail::mapImpl<InType, OutType, MapOp, TPB, Args...>(out, len, map, stream,
+                                                        in, args...);
 }
 
 }  // namespace linalg
diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp
index 6cd58a43dc..75baf86e1c 100644
--- a/cpp/include/raft/linalg/map_then_reduce.hpp
+++ b/cpp/include/raft/linalg/map_then_reduce.hpp
@@ -39,8 +39,9 @@ template <typename InType, typename MapOp, int TPB = 256, typename... Args,
           typename OutType = InType>
 void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream,
                       const InType *in, Args... args) {
-  detail::mapThenReduceImpl<InType, OutType, MapOp, detail::sum_tag, TPB, Args...>(
-    out, len, (OutType)0, map, detail::sum_tag(), stream, in, args...);
+  detail::mapThenReduceImpl<InType, OutType, MapOp, detail::sum_tag, TPB,
+                            Args...>(out, len, (OutType)0, map,
+                                     detail::sum_tag(), stream, in, args...);
 }
 
 /**
diff --git a/cpp/include/raft/linalg/matrix_vector_op.hpp b/cpp/include/raft/linalg/matrix_vector_op.hpp
index 7ef02735ae..2cfaa0564c 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.hpp
+++ b/cpp/include/raft/linalg/matrix_vector_op.hpp
@@ -48,7 +48,8 @@ template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
 void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D,
                     IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op,
                     cudaStream_t stream) {
-  detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
+  detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op,
+                         stream);
 }
 
 /**
@@ -79,7 +80,8 @@ template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
 void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1,
                     const Type *vec2, IdxType D, IdxType N, bool rowMajor,
                     bool bcastAlongRows, Lambda op, cudaStream_t stream) {
-  detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
+  detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor,
+                         bcastAlongRows, op, stream);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp
index 588169c580..3935e648dc 100644
--- a/cpp/include/raft/linalg/strided_reduction.hpp
+++ b/cpp/include/raft/linalg/strided_reduction.hpp
@@ -57,7 +57,8 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N,
                       MainLambda main_op = raft::Nop<InType, IdxType>(),
                       ReduceLambda reduce_op = raft::Sum<OutType>(),
                       FinalLambda final_op = raft::Nop<OutType>()) {
-  detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+  detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op,
+                           reduce_op, final_op);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/svd.hpp b/cpp/include/raft/linalg/svd.hpp
index 7f1651cc81..970c339090 100644
--- a/cpp/include/raft/linalg/svd.hpp
+++ b/cpp/include/raft/linalg/svd.hpp
@@ -44,7 +44,9 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
            T *sing_vals, T *left_sing_vecs, T *right_sing_vecs,
            bool trans_right, bool gen_left_vec, bool gen_right_vec,
            cudaStream_t stream) {
- detail::svdQR(handle, in, n_rows, n_cols, sing_vals, left_sing_vecs, right_sing_vecs, trans_right, gen_left_vec, gen_right_vec, stream);
+  detail::svdQR(handle, in, n_rows, n_cols, sing_vals, left_sing_vecs,
+                right_sing_vecs, trans_right, gen_left_vec, gen_right_vec,
+                stream);
 }
 
 template <typename T>
@@ -75,7 +77,9 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
                math_t *sing_vals, math_t *left_sing_vecs,
                math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec,
                math_t tol, int max_sweeps, cudaStream_t stream) {
-  detail::svdJacobi(handle, in, n_rows, n_cols, sing_vals, left_sing_vecs, right_sing_vecs, gen_left_vec, gen_right_vec, tol, max_sweeps, stream);
+  detail::svdJacobi(handle, in, n_rows, n_cols, sing_vals, left_sing_vecs,
+                    right_sing_vecs, gen_left_vec, gen_right_vec, tol,
+                    max_sweeps, stream);
 }
 
 /**
@@ -122,7 +126,8 @@ template <typename math_t>
 bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U,
                          math_t *S_vec, math_t *V, int n_rows, int n_cols,
                          int k, math_t tol, cudaStream_t stream) {
-  return detail::evaluateSVDByL2Norm(handle, A_d, U, S_vec, V, n_rows, n_cols, k, tol, stream);
+  return detail::evaluateSVDByL2Norm(handle, A_d, U, S_vec, V, n_rows, n_cols,
+                                     k, tol, stream);
 }
 
 };  // end namespace linalg
diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu
index 5c525ce791..765a38f583 100644
--- a/cpp/test/linalg/eig_sel.cu
+++ b/cpp/test/linalg/eig_sel.cu
@@ -69,9 +69,9 @@ class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
     raft::update_device(eig_vectors_ref.data(), eig_vectors_ref_h, 12, stream);
     raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, 4, stream);
 
-    raft::linalg::eigSelDC(handle, cov_matrix.data(), params.n_row, params.n_col, 3,
-             eig_vectors.data(), eig_vals.data(),
-             EigVecMemUsage::OVERWRITE_INPUT, stream);
+    raft::linalg::eigSelDC(handle, cov_matrix.data(), params.n_row,
+                           params.n_col, 3, eig_vectors.data(), eig_vals.data(),
+                           EigVecMemUsage::OVERWRITE_INPUT, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 

From f7d43b568621b64c7e0d546ba25b33eb70a97ad6 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 17 Nov 2021 11:33:21 -0800
Subject: [PATCH 04/17] correcting include

---
 cpp/include/raft/linalg/mean_squared_error.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/linalg/mean_squared_error.hpp b/cpp/include/raft/linalg/mean_squared_error.hpp
index 9d1538c172..89d91719c7 100644
--- a/cpp/include/raft/linalg/mean_squared_error.hpp
+++ b/cpp/include/raft/linalg/mean_squared_error.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "map_then_reduce.cuh"
+#include "map_then_reduce.hpp"
 
 namespace raft {
 namespace linalg {

From 9c0d6551b38b05110218d9c4efa7852951a7a488 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Tue, 14 Dec 2021 12:04:50 -0800
Subject: [PATCH 05/17] removing deleted file again

---
 cpp/include/raft/sparse/selection/knn.cuh | 436 ----------------------
 1 file changed, 436 deletions(-)
 delete mode 100644 cpp/include/raft/sparse/selection/knn.cuh

diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh
deleted file mode 100644
index 631a740bfb..0000000000
--- a/cpp/include/raft/sparse/selection/knn.cuh
+++ /dev/null
@@ -1,436 +0,0 @@
-/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <rmm/device_uvector.hpp>
-
-#include <raft/cudart_utils.h>
-#include <raft/linalg/distance_type.h>
-#include <raft/cuda_utils.cuh>
-#include <raft/linalg/unary_op.hpp>
-#include <raft/matrix/matrix.hpp>
-#include <raft/mr/device/buffer.hpp>
-
-#include <raft/sparse/op/slice.h>
-#include <raft/sparse/utils.h>
-#include <raft/sparse/coo.cuh>
-#include <raft/sparse/csr.cuh>
-#include <raft/sparse/distance/distance.hpp>
-#include <raft/spatial/knn/knn.hpp>
-
-namespace raft {
-namespace sparse {
-namespace selection {
-
-template <typename value_idx, typename value_t>
-struct csr_batcher_t {
-  csr_batcher_t(value_idx batch_size, value_idx n_rows,
-                const value_idx *csr_indptr, const value_idx *csr_indices,
-                const value_t *csr_data)
-    : batch_start_(0),
-      batch_stop_(0),
-      batch_rows_(0),
-      total_rows_(n_rows),
-      batch_size_(batch_size),
-      csr_indptr_(csr_indptr),
-      csr_indices_(csr_indices),
-      csr_data_(csr_data),
-      batch_csr_start_offset_(0),
-      batch_csr_stop_offset_(0) {}
-
-  void set_batch(int batch_num) {
-    batch_start_ = batch_num * batch_size_;
-    batch_stop_ = batch_start_ + batch_size_ - 1;  // zero-based indexing
-
-    if (batch_stop_ >= total_rows_)
-      batch_stop_ = total_rows_ - 1;  // zero-based indexing
-
-    batch_rows_ = (batch_stop_ - batch_start_) + 1;
-  }
-
-  value_idx get_batch_csr_indptr_nnz(value_idx *batch_indptr,
-                                     cudaStream_t stream) {
-    raft::sparse::op::csr_row_slice_indptr(
-      batch_start_, batch_stop_, csr_indptr_, batch_indptr,
-      &batch_csr_start_offset_, &batch_csr_stop_offset_, stream);
-
-    return batch_csr_stop_offset_ - batch_csr_start_offset_;
-  }
-
-  void get_batch_csr_indices_data(value_idx *csr_indices, value_t *csr_data,
-                                  cudaStream_t stream) {
-    raft::sparse::op::csr_row_slice_populate(
-      batch_csr_start_offset_, batch_csr_stop_offset_, csr_indices_, csr_data_,
-      csr_indices, csr_data, stream);
-  }
-
-  value_idx batch_rows() const { return batch_rows_; }
-
-  value_idx batch_start() const { return batch_start_; }
-
-  value_idx batch_stop() const { return batch_stop_; }
-
- private:
-  value_idx batch_size_;
-  value_idx batch_start_;
-  value_idx batch_stop_;
-  value_idx batch_rows_;
-
-  value_idx total_rows_;
-
-  const value_idx *csr_indptr_;
-  const value_idx *csr_indices_;
-  const value_t *csr_data_;
-
-  value_idx batch_csr_start_offset_;
-  value_idx batch_csr_stop_offset_;
-};
-
-template <typename value_idx, typename value_t>
-class sparse_knn_t {
- public:
-  sparse_knn_t(const value_idx *idxIndptr_, const value_idx *idxIndices_,
-               const value_t *idxData_, size_t idxNNZ_, int n_idx_rows_,
-               int n_idx_cols_, const value_idx *queryIndptr_,
-               const value_idx *queryIndices_, const value_t *queryData_,
-               size_t queryNNZ_, int n_query_rows_, int n_query_cols_,
-               value_idx *output_indices_, value_t *output_dists_, int k_,
-               const raft::handle_t &handle_,
-               size_t batch_size_index_ = 2 << 14,  // approx 1M
-               size_t batch_size_query_ = 2 << 14,
-               raft::distance::DistanceType metric_ =
-                 raft::distance::DistanceType::L2Expanded,
-               float metricArg_ = 0)
-    : idxIndptr(idxIndptr_),
-      idxIndices(idxIndices_),
-      idxData(idxData_),
-      idxNNZ(idxNNZ_),
-      n_idx_rows(n_idx_rows_),
-      n_idx_cols(n_idx_cols_),
-      queryIndptr(queryIndptr_),
-      queryIndices(queryIndices_),
-      queryData(queryData_),
-      queryNNZ(queryNNZ_),
-      n_query_rows(n_query_rows_),
-      n_query_cols(n_query_cols_),
-      output_indices(output_indices_),
-      output_dists(output_dists_),
-      k(k_),
-      handle(handle_),
-      batch_size_index(batch_size_index_),
-      batch_size_query(batch_size_query_),
-      metric(metric_),
-      metricArg(metricArg_) {}
-
-  void run() {
-    using namespace raft::sparse;
-
-    int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query);
-    csr_batcher_t<value_idx, value_t> query_batcher(
-      batch_size_query, n_query_rows, queryIndptr, queryIndices, queryData);
-
-    size_t rows_processed = 0;
-
-    for (int i = 0; i < n_batches_query; i++) {
-      /**
-        * Compute index batch info
-        */
-      query_batcher.set_batch(i);
-
-      /**
-        * Slice CSR to rows in batch
-        */
-
-      rmm::device_uvector<value_idx> query_batch_indptr(
-        query_batcher.batch_rows() + 1, handle.get_stream());
-
-      value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz(
-        query_batch_indptr.data(), handle.get_stream());
-
-      rmm::device_uvector<value_idx> query_batch_indices(n_query_batch_nnz,
-                                                         handle.get_stream());
-      rmm::device_uvector<value_t> query_batch_data(n_query_batch_nnz,
-                                                    handle.get_stream());
-
-      query_batcher.get_batch_csr_indices_data(query_batch_indices.data(),
-                                               query_batch_data.data(),
-                                               handle.get_stream());
-
-      // A 3-partition temporary merge space to scale the batching. 2 parts for subsequent
-      // batches and 1 space for the results of the merge, which get copied back to the top
-      rmm::device_uvector<value_idx> merge_buffer_indices(0,
-                                                          handle.get_stream());
-      rmm::device_uvector<value_t> merge_buffer_dists(0, handle.get_stream());
-
-      value_t *dists_merge_buffer_ptr;
-      value_idx *indices_merge_buffer_ptr;
-
-      int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index);
-      csr_batcher_t<value_idx, value_t> idx_batcher(
-        batch_size_index, n_idx_rows, idxIndptr, idxIndices, idxData);
-
-      for (int j = 0; j < n_batches_idx; j++) {
-        idx_batcher.set_batch(j);
-
-        merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3,
-                                    handle.get_stream());
-        merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3,
-                                  handle.get_stream());
-
-        /**
-          * Slice CSR to rows in batch
-        */
-        rmm::device_uvector<value_idx> idx_batch_indptr(
-          idx_batcher.batch_rows() + 1, handle.get_stream());
-        rmm::device_uvector<value_idx> idx_batch_indices(0,
-                                                         handle.get_stream());
-        rmm::device_uvector<value_t> idx_batch_data(0, handle.get_stream());
-
-        value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz(
-          idx_batch_indptr.data(), handle.get_stream());
-
-        idx_batch_indices.resize(idx_batch_nnz, handle.get_stream());
-        idx_batch_data.resize(idx_batch_nnz, handle.get_stream());
-
-        idx_batcher.get_batch_csr_indices_data(
-          idx_batch_indices.data(), idx_batch_data.data(), handle.get_stream());
-
-        /**
-           * Compute distances
-           */
-        size_t dense_size =
-          idx_batcher.batch_rows() * query_batcher.batch_rows();
-        rmm::device_uvector<value_t> batch_dists(dense_size,
-                                                 handle.get_stream());
-
-        CUDA_CHECK(cudaMemset(batch_dists.data(), 0,
-                              batch_dists.size() * sizeof(value_t)));
-
-        compute_distances(idx_batcher, query_batcher, idx_batch_nnz,
-                          n_query_batch_nnz, idx_batch_indptr.data(),
-                          idx_batch_indices.data(), idx_batch_data.data(),
-                          query_batch_indptr.data(), query_batch_indices.data(),
-                          query_batch_data.data(), batch_dists.data());
-
-        // Build batch indices array
-        rmm::device_uvector<value_idx> batch_indices(batch_dists.size(),
-                                                     handle.get_stream());
-
-        // populate batch indices array
-        value_idx batch_rows = query_batcher.batch_rows(),
-                  batch_cols = idx_batcher.batch_rows();
-
-        iota_fill(batch_indices.data(), batch_rows, batch_cols,
-                  handle.get_stream());
-
-        /**
-         * Perform k-selection on batch & merge with other k-selections
-         */
-        size_t merge_buffer_offset = batch_rows * k;
-        dists_merge_buffer_ptr =
-          merge_buffer_dists.data() + merge_buffer_offset;
-        indices_merge_buffer_ptr =
-          merge_buffer_indices.data() + merge_buffer_offset;
-
-        perform_k_selection(idx_batcher, query_batcher, batch_dists.data(),
-                            batch_indices.data(), dists_merge_buffer_ptr,
-                            indices_merge_buffer_ptr);
-
-        value_t *dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr;
-        value_idx *indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr;
-
-        // Merge results of difference batches if necessary
-        if (idx_batcher.batch_start() > 0) {
-          size_t merge_buffer_tmp_out = batch_rows * k * 2;
-          dists_merge_buffer_tmp_ptr =
-            merge_buffer_dists.data() + merge_buffer_tmp_out;
-          indices_merge_buffer_tmp_ptr =
-            merge_buffer_indices.data() + merge_buffer_tmp_out;
-
-          merge_batches(idx_batcher, query_batcher, merge_buffer_dists.data(),
-                        merge_buffer_indices.data(), dists_merge_buffer_tmp_ptr,
-                        indices_merge_buffer_tmp_ptr);
-        }
-
-        // copy merged output back into merge buffer partition for next iteration
-        raft::copy_async<value_idx>(merge_buffer_indices.data(),
-                                    indices_merge_buffer_tmp_ptr,
-                                    batch_rows * k, handle.get_stream());
-        raft::copy_async<value_t>(merge_buffer_dists.data(),
-                                  dists_merge_buffer_tmp_ptr, batch_rows * k,
-                                  handle.get_stream());
-      }
-
-      // Copy final merged batch to output array
-      raft::copy_async<value_idx>(
-        output_indices + (rows_processed * k), merge_buffer_indices.data(),
-        query_batcher.batch_rows() * k, handle.get_stream());
-      raft::copy_async<value_t>(
-        output_dists + (rows_processed * k), merge_buffer_dists.data(),
-        query_batcher.batch_rows() * k, handle.get_stream());
-
-      rows_processed += query_batcher.batch_rows();
-    }
-  }
-
- private:
-  void merge_batches(csr_batcher_t<value_idx, value_t> &idx_batcher,
-                     csr_batcher_t<value_idx, value_t> &query_batcher,
-                     value_t *merge_buffer_dists,
-                     value_idx *merge_buffer_indices, value_t *out_dists,
-                     value_idx *out_indices) {
-    // build translation buffer to shift resulting indices by the batch
-    std::vector<value_idx> id_ranges;
-    id_ranges.push_back(0);
-    id_ranges.push_back(idx_batcher.batch_start());
-
-    rmm::device_uvector<value_idx> trans(id_ranges.size(), handle.get_stream());
-    raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(),
-                        handle.get_stream());
-
-    // combine merge buffers only if there's more than 1 partition to combine
-    raft::spatial::knn::knn_merge_parts(
-      merge_buffer_dists, merge_buffer_indices, out_dists, out_indices,
-      query_batcher.batch_rows(), 2, k, handle.get_stream(), trans.data());
-  }
-
-  void perform_k_selection(csr_batcher_t<value_idx, value_t> idx_batcher,
-                           csr_batcher_t<value_idx, value_t> query_batcher,
-                           value_t *batch_dists, value_idx *batch_indices,
-                           value_t *out_dists, value_idx *out_indices) {
-    // populate batch indices array
-    value_idx batch_rows = query_batcher.batch_rows(),
-              batch_cols = idx_batcher.batch_rows();
-
-    // build translation buffer to shift resulting indices by the batch
-    std::vector<value_idx> id_ranges;
-    id_ranges.push_back(0);
-    id_ranges.push_back(idx_batcher.batch_start());
-
-    // in the case where the number of idx rows in the batch is < k, we
-    // want to adjust k.
-    value_idx n_neighbors = min(k, batch_cols);
-
-    bool ascending = true;
-    if (metric == raft::distance::DistanceType::InnerProduct) ascending = false;
-
-    // kernel to slice first (min) k cols and copy into batched merge buffer
-    raft::spatial::knn::select_k(batch_dists, batch_indices, batch_rows,
-                                 batch_cols, out_dists, out_indices, ascending,
-                                 n_neighbors, handle.get_stream());
-  }
-
-  void compute_distances(csr_batcher_t<value_idx, value_t> &idx_batcher,
-                         csr_batcher_t<value_idx, value_t> &query_batcher,
-                         size_t idx_batch_nnz, size_t query_batch_nnz,
-                         value_idx *idx_batch_indptr,
-                         value_idx *idx_batch_indices, value_t *idx_batch_data,
-                         value_idx *query_batch_indptr,
-                         value_idx *query_batch_indices,
-                         value_t *query_batch_data, value_t *batch_dists) {
-    /**
-     * Compute distances
-     */
-    raft::sparse::distance::distances_config_t<value_idx, value_t> dist_config(
-      handle);
-    dist_config.b_nrows = idx_batcher.batch_rows();
-    dist_config.b_ncols = n_idx_cols;
-    dist_config.b_nnz = idx_batch_nnz;
-
-    dist_config.b_indptr = idx_batch_indptr;
-    dist_config.b_indices = idx_batch_indices;
-    dist_config.b_data = idx_batch_data;
-
-    dist_config.a_nrows = query_batcher.batch_rows();
-    dist_config.a_ncols = n_query_cols;
-    dist_config.a_nnz = query_batch_nnz;
-
-    dist_config.a_indptr = query_batch_indptr;
-    dist_config.a_indices = query_batch_indices;
-    dist_config.a_data = query_batch_data;
-
-    if (raft::sparse::distance::supportedDistance.find(metric) ==
-        raft::sparse::distance::supportedDistance.end())
-      THROW("DistanceType not supported: %d", metric);
-
-    raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric,
-                                             metricArg);
-  }
-
-  const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices;
-  value_idx *output_indices;
-  const value_t *idxData, *queryData;
-  value_t *output_dists;
-
-  size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query;
-
-  raft::distance::DistanceType metric;
-
-  float metricArg;
-
-  int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k;
-
-  const raft::handle_t &handle;
-};
-
-/**
-   * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors
-   * using some distance implementation
-   * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1)
-   * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz)
-   * @param[in] idxData csr data array of the index matrix (size idxNNZ)
-   * @param[in] idxNNA number of non-zeros for sparse index matrix
-   * @param[in] n_idx_rows number of data samples in index matrix
-   * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1)
-   * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ)
-   * @param[in] queryData csr data array of the query matrix (size queryNNZ)
-   * @param[in] queryNNZ number of non-zeros for sparse query matrix
-   * @param[in] n_query_rows number of data samples in query matrix
-   * @param[in] n_query_cols number of features in query matrix
-   * @param[out] output_indices dense matrix for output indices (size n_query_rows * k)
-   * @param[out] output_dists dense matrix for output distances (size n_query_rows * k)
-   * @param[in] k the number of neighbors to query
-   * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to
-   * @param[in] batch_size_index maximum number of rows to use from index matrix per batch
-   * @param[in] batch_size_query maximum number of rows to use from query matrix per batch
-   * @param[in] metric distance metric/measure to use
-   * @param[in] metricArg potential argument for metric (currently unused)
-   */
-template <typename value_idx = int, typename value_t = float, int TPB_X = 32>
-void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices,
-                     const value_t *idxData, size_t idxNNZ, int n_idx_rows,
-                     int n_idx_cols, const value_idx *queryIndptr,
-                     const value_idx *queryIndices, const value_t *queryData,
-                     size_t queryNNZ, int n_query_rows, int n_query_cols,
-                     value_idx *output_indices, value_t *output_dists, int k,
-                     const raft::handle_t &handle,
-                     size_t batch_size_index = 2 << 14,  // approx 1M
-                     size_t batch_size_query = 2 << 14,
-                     raft::distance::DistanceType metric =
-                       raft::distance::DistanceType::L2Expanded,
-                     float metricArg = 0) {
-  sparse_knn_t<value_idx, value_t>(
-    idxIndptr, idxIndices, idxData, idxNNZ, n_idx_rows, n_idx_cols, queryIndptr,
-    queryIndices, queryData, queryNNZ, n_query_rows, n_query_cols,
-    output_indices, output_dists, k, handle, batch_size_index, batch_size_query,
-    metric, metricArg)
-    .run();
-}
-
-};  // namespace selection
-};  // namespace sparse
-};  // namespace raft

From a071d09dab8ae6850d35d550cbc4e368d1f9ee70 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Tue, 14 Dec 2021 12:52:10 -0800
Subject: [PATCH 06/17] correcting merges and passing tests

---
 cpp/include/raft/linalg/add.hpp               |  41 +-
 cpp/include/raft/linalg/binary_op.hpp         |  12 +-
 .../raft/linalg/cholesky_r1_update.hpp        |  16 +-
 .../raft/linalg/coalesced_reduction.hpp       |  25 +-
 cpp/include/raft/linalg/detail/add.cuh        |  26 +-
 cpp/include/raft/linalg/detail/binary_op.cuh  |  31 -
 .../raft/linalg/detail/cholesky_r1_update.hpp |  67 +-
 .../linalg/detail/coalesced_reduction.cuh     |  32 -
 .../raft/linalg/detail/contractions.cuh       | 117 +--
 cpp/include/raft/linalg/detail/eig.hpp        |  22 +-
 cpp/include/raft/linalg/detail/functional.cuh |  15 +-
 cpp/include/raft/linalg/detail/gemm.hpp       |  41 -
 cpp/include/raft/linalg/detail/lanczos.hpp    | 778 ++++++++++++------
 cpp/include/raft/linalg/detail/map.cuh        |  19 +-
 .../raft/linalg/detail/map_then_reduce.cuh    |  90 +-
 .../raft/linalg/detail/strided_reduction.cuh  |  32 -
 cpp/include/raft/linalg/detail/subtract.cuh   |  23 +-
 cpp/include/raft/linalg/detail/svd.cuh        |  64 +-
 cpp/include/raft/linalg/detail/unary_op.cuh   |  72 +-
 cpp/include/raft/linalg/divide.hpp            |   4 +-
 cpp/include/raft/linalg/eig.hpp               |  43 +-
 cpp/include/raft/linalg/eltwise.hpp           |  39 +-
 cpp/include/raft/linalg/gemm.hpp              |  67 +-
 cpp/include/raft/linalg/lanczos.hpp           | 158 +++-
 cpp/include/raft/linalg/map.hpp               |   7 +-
 cpp/include/raft/linalg/map_then_reduce.hpp   |  30 +-
 cpp/include/raft/linalg/matrix_vector_op.hpp  |  33 +-
 cpp/include/raft/linalg/qr.hpp                |  19 +-
 cpp/include/raft/linalg/strided_reduction.hpp |  25 +-
 cpp/include/raft/linalg/subtract.hpp          |  12 +-
 cpp/include/raft/linalg/svd.hpp               | 121 ++-
 cpp/include/raft/linalg/unary_op.hpp          |  18 +-
 .../sparse/distance/detail/l2_distance.cuh    |   5 -
 .../raft/sparse/selection/detail/knn.cuh      |   2 +-
 .../raft/spatial/knn/detail/fused_l2_knn.cuh  |   2 +-
 cpp/test/linalg/eig_sel.cu                    |  12 +-
 36 files changed, 1172 insertions(+), 948 deletions(-)

diff --git a/cpp/include/raft/linalg/add.hpp b/cpp/include/raft/linalg/add.hpp
index 452cb00051..2a59339c20 100644
--- a/cpp/include/raft/linalg/add.hpp
+++ b/cpp/include/raft/linalg/add.hpp
@@ -42,16 +42,9 @@ using detail::adds_scalar;
  * @param stream cuda stream where to launch work
  */
 template <typename InT, typename OutT = InT, typename IdxType = int>
-<<<<<<< HEAD:cpp/include/raft/linalg/add.hpp
-void addScalar(OutT *out, const InT *in, InT scalar, IdxType len,
-               cudaStream_t stream) {
-  unaryOp(out, in, len, adds_scalar<InT, OutT>(scalar), stream);
-=======
 void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
 {
-  auto op = [scalar] __device__(InT in) { return OutT(in + scalar); };
-  unaryOp<InT, decltype(op), IdxType, OutT>(out, in, len, op, stream);
->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/add.cuh
+  unaryOp(out, in, len, adds_scalar<InT, OutT>(scalar), stream);
 }
 
 /**
@@ -68,26 +61,9 @@ void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t s
  * @param stream cuda stream where to launch work
  */
 template <typename InT, typename OutT = InT, typename IdxType = int>
-<<<<<<< HEAD:cpp/include/raft/linalg/add.hpp
-void add(OutT *out, const InT *in1, const InT *in2, IdxType len,
-         cudaStream_t stream) {
-  binaryOp(out, in1, in2, len, thrust::plus<InT>(), stream);
-=======
 void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
 {
-  auto op = [] __device__(InT a, InT b) { return OutT(a + b); };
-  binaryOp<InT, decltype(op), OutT, IdxType>(out, in1, in2, len, op, stream);
-}
-
-template <class math_t, typename IdxType>
-__global__ void add_dev_scalar_kernel(math_t* outDev,
-                                      const math_t* inDev,
-                                      const math_t* singleScalarDev,
-                                      IdxType len)
-{
-  IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
-  if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; }
->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/add.cuh
+  binaryOp(out, in1, in2, len, thrust::plus<InT>(), stream);
 }
 
 /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and
@@ -101,24 +77,13 @@ __global__ void add_dev_scalar_kernel(math_t* outDev,
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-<<<<<<< HEAD:cpp/include/raft/linalg/add.hpp
-void addDevScalar(math_t *outDev, const math_t *inDev,
-                  const math_t *singleScalarDev, IdxType len,
-                  cudaStream_t stream) {
-  detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream);
-=======
 void addDevScalar(math_t* outDev,
                   const math_t* inDev,
                   const math_t* singleScalarDev,
                   IdxType len,
                   cudaStream_t stream)
 {
-  // TODO: block dimension has not been tuned
-  dim3 block(256);
-  dim3 grid(raft::ceildiv(len, (IdxType)block.x));
-  add_dev_scalar_kernel<math_t><<<grid, block, 0, stream>>>(outDev, inDev, singleScalarDev, len);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/add.cuh
+  detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/binary_op.hpp b/cpp/include/raft/linalg/binary_op.hpp
index 1e03a1d231..e482240b59 100644
--- a/cpp/include/raft/linalg/binary_op.hpp
+++ b/cpp/include/raft/linalg/binary_op.hpp
@@ -39,10 +39,14 @@ namespace linalg {
  * @note Lambda must be a functor with the following signature:
  *       `OutType func(const InType& val1, const InType& val2);`
  */
-template <typename InType, typename Lambda, typename OutType = InType,
-          typename IdxType = int, int TPB = 256>
-void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len,
-              Lambda op, cudaStream_t stream) {
+template <typename InType,
+          typename Lambda,
+          typename OutType = InType,
+          typename IdxType = int,
+          int TPB          = 256>
+void binaryOp(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream)
+{
   detail::binaryOp(out, in1, in2, len, op, stream);
 }
 
diff --git a/cpp/include/raft/linalg/cholesky_r1_update.hpp b/cpp/include/raft/linalg/cholesky_r1_update.hpp
index ed0307cd2b..2428972d85 100644
--- a/cpp/include/raft/linalg/cholesky_r1_update.hpp
+++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp
@@ -118,11 +118,17 @@ namespace linalg {
  *    conditioned systems. Negative values mean no regularizaton.
  */
 template <typename math_t>
-void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
-                         void *workspace, int *n_bytes, cublasFillMode_t uplo,
-                         cudaStream_t stream, math_t eps = -1) {
-  detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo,
-                              stream, eps);
+void choleskyRank1Update(const raft::handle_t& handle,
+                         math_t* L,
+                         int n,
+                         int ld,
+                         void* workspace,
+                         int* n_bytes,
+                         cublasFillMode_t uplo,
+                         cudaStream_t stream,
+                         math_t eps = -1)
+{
+  detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, stream, eps);
 }
 };  // namespace linalg
 };  // namespace raft
diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp
index ad5279b1ad..a8f19f61b1 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.hpp
+++ b/cpp/include/raft/linalg/coalesced_reduction.hpp
@@ -48,17 +48,24 @@ namespace linalg {
  * @param inplace reduction result added inplace or overwrites old values?
  * @param stream cuda stream where to launch work
  */
-template <typename InType, typename OutType = InType, typename IdxType = int,
-          typename MainLambda = raft::Nop<InType, IdxType>,
+template <typename InType,
+          typename OutType      = InType,
+          typename IdxType      = int,
+          typename MainLambda   = raft::Nop<InType, IdxType>,
           typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda = raft::Nop<OutType>>
-void coalescedReduction(OutType *dots, const InType *data, int D, int N,
-                        OutType init, cudaStream_t stream, bool inplace = false,
-                        MainLambda main_op = raft::Nop<InType, IdxType>(),
+          typename FinalLambda  = raft::Nop<OutType>>
+void coalescedReduction(OutType* dots,
+                        const InType* data,
+                        int D,
+                        int N,
+                        OutType init,
+                        cudaStream_t stream,
+                        bool inplace           = false,
+                        MainLambda main_op     = raft::Nop<InType, IdxType>(),
                         ReduceLambda reduce_op = raft::Sum<OutType>(),
-                        FinalLambda final_op = raft::Nop<OutType>()) {
-  detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op,
-                             reduce_op, final_op);
+                        FinalLambda final_op   = raft::Nop<OutType>())
+{
+  detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh
index be7b8bb299..8459f7924d 100644
--- a/cpp/include/raft/linalg/detail/add.cuh
+++ b/cpp/include/raft/linalg/detail/add.cuh
@@ -23,25 +23,27 @@ namespace linalg {
 namespace detail {
 
 template <class math_t, typename IdxType>
-__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
-                                      const math_t *singleScalarDev,
-                                      IdxType len) {
+__global__ void add_dev_scalar_kernel(math_t* outDev,
+                                      const math_t* inDev,
+                                      const math_t* singleScalarDev,
+                                      IdxType len)
+{
   IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
-  if (i < len) {
-    outDev[i] = inDev[i] + *singleScalarDev;
-  }
+  if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; }
 }
 
 template <typename math_t, typename IdxType = int>
-void addDevScalar(math_t *outDev, const math_t *inDev,
-                  const math_t *singleScalarDev, IdxType len,
-                  cudaStream_t stream) {
+void addDevScalar(math_t* outDev,
+                  const math_t* inDev,
+                  const math_t* singleScalarDev,
+                  IdxType len,
+                  cudaStream_t stream)
+{
   // TODO: block dimension has not been tuned
   dim3 block(256);
   dim3 grid(raft::ceildiv(len, (IdxType)block.x));
-  add_dev_scalar_kernel<math_t>
-    <<<grid, block, 0, stream>>>(outDev, inDev, singleScalarDev, len);
-  CUDA_CHECK(cudaPeekAtLastError());
+  add_dev_scalar_kernel<math_t><<<grid, block, 0, stream>>>(outDev, inDev, singleScalarDev, len);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
 }  // namespace detail
diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh
index de7ca96fe1..7c9ba2aeed 100644
--- a/cpp/include/raft/linalg/detail/binary_op.cuh
+++ b/cpp/include/raft/linalg/detail/binary_op.cuh
@@ -60,36 +60,6 @@ inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint6
   return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0;
 }
 
-<<<<<<< HEAD:cpp/include/raft/linalg/detail/binary_op.cuh
-template <typename InType, typename Lambda, typename OutType = InType,
-          typename IdxType = int, int TPB = 256>
-void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len,
-              Lambda op, cudaStream_t stream) {
-  constexpr auto maxSize =
-    sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
-  size_t bytes = len * maxSize;
-  uint64_t in1Addr = uint64_t(in1);
-  uint64_t in2Addr = uint64_t(in2);
-  uint64_t outAddr = uint64_t(out);
-  if (16 / maxSize && bytes % 16 == 0 &&
-      addressAligned(in1Addr, in2Addr, outAddr, 16)) {
-=======
-/**
- * @brief perform element-wise binary operation on the input arrays
- * @tparam InType input data-type
- * @tparam Lambda the device-lambda performing the actual operation
- * @tparam OutType output data-type
- * @tparam IdxType Integer type used to for addressing
- * @tparam TPB threads-per-block in the final kernel launched
- * @param out the output array
- * @param in1 the first input array
- * @param in2 the second input array
- * @param len number of elements in the input array
- * @param op the device-lambda
- * @param stream cuda stream where to launch work
- * @note Lambda must be a functor with the following signature:
- *       `OutType func(const InType& val1, const InType& val2);`
- */
 template <typename InType,
           typename Lambda,
           typename OutType = InType,
@@ -104,7 +74,6 @@ void binaryOp(
   uint64_t in2Addr       = uint64_t(in2);
   uint64_t outAddr       = uint64_t(out);
   if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) {
->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/binary_op.cuh
     binaryOpImpl<InType, 16 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) {
diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
index 49bb190836..db00c5d6fc 100644
--- a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
+++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
@@ -27,9 +27,16 @@ namespace linalg {
 namespace detail {
 
 template <typename math_t>
-void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
-                         void *workspace, int *n_bytes, cublasFillMode_t uplo,
-                         cudaStream_t stream, math_t eps = -1) {
+void choleskyRank1Update(const raft::handle_t& handle,
+                         math_t* L,
+                         int n,
+                         int ld,
+                         void* workspace,
+                         int* n_bytes,
+                         cublasFillMode_t uplo,
+                         cudaStream_t stream,
+                         math_t eps = -1)
+{
   // The matrix A' is defined as:
   // A' = [[A_11, A_12]
   //       [A_21, A_22]]
@@ -49,18 +56,17 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
   // We need a workspace in device memory to store a scalar. Additionally, in
   // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats.
   const int align = 256;
-  int offset = (uplo == CUBLAS_FILL_MODE_LOWER)
-                 ? raft::alignTo<int>(sizeof(math_t) * (n - 1), align)
-                 : 0;
+  int offset =
+    (uplo == CUBLAS_FILL_MODE_LOWER) ? raft::alignTo<int>(sizeof(math_t) * (n - 1), align) : 0;
   if (workspace == nullptr) {
     *n_bytes = offset + 1 * sizeof(math_t);
     return;
   }
-  math_t *s = reinterpret_cast<math_t *>(((char *)workspace) + offset);
-  math_t *L_22 = L + (n - 1) * ld + n - 1;
+  math_t* s    = reinterpret_cast<math_t*>(((char*)workspace) + offset);
+  math_t* L_22 = L + (n - 1) * ld + n - 1;
 
-  math_t *A_new;
-  math_t *A_row;
+  math_t* A_new;
+  math_t* A_row;
   if (uplo == CUBLAS_FILL_MODE_UPPER) {
     // A_new is stored as the n-1 th column of L
     A_new = L + (n - 1) * ld;
@@ -69,30 +75,39 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
     // as the n-th row of L. Since the matrix is column major, this is non
     // contiguous. We copy elements from A_row to a contiguous workspace A_new.
     A_row = L + n - 1;
-    A_new = reinterpret_cast<math_t *>(workspace);
-    CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1,
-                                          A_row, ld, A_new, 1, stream));
+    A_new = reinterpret_cast<math_t*>(workspace);
+    RAFT_CUBLAS_TRY(
+      raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream));
   }
-  cublasOperation_t op =
-    (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N;
   if (n > 1) {
     // Calculate L_12 = x by solving equation L_11 x = A_12
     math_t alpha = 1;
-    CUBLAS_CHECK(raft::linalg::cublastrsm(
-      handle.get_cublas_handle(), CUBLAS_SIDE_LEFT, uplo, op,
-      CUBLAS_DIAG_NON_UNIT, n - 1, 1, &alpha, L, ld, A_new, n - 1, stream));
+    RAFT_CUBLAS_TRY(raft::linalg::cublastrsm(handle.get_cublas_handle(),
+                                             CUBLAS_SIDE_LEFT,
+                                             uplo,
+                                             op,
+                                             CUBLAS_DIAG_NON_UNIT,
+                                             n - 1,
+                                             1,
+                                             &alpha,
+                                             L,
+                                             ld,
+                                             A_new,
+                                             n - 1,
+                                             stream));
 
     // A_new now stores L_12, we calculate s = L_12 * L_12
-    CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1,
-                                         A_new, 1, A_new, 1, s, stream));
+    RAFT_CUBLAS_TRY(
+      raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream));
 
     if (uplo == CUBLAS_FILL_MODE_LOWER) {
       // Copy back the L_12 elements as the n-th row of L
-      CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1,
-                                            A_new, 1, A_row, ld, stream));
+      RAFT_CUBLAS_TRY(
+        raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream));
     }
   } else {  // n == 1 case
-    CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(s, 0, sizeof(math_t), stream));
   }
 
   // L_22 = sqrt(A_22 - L_12 * L_12)
@@ -100,16 +115,14 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
   math_t L_22_host;
   raft::update_host(&s_host, s, 1, stream);
   raft::update_host(&L_22_host, L_22, 1, stream);  // L_22 stores A_22
-  CUDA_CHECK(cudaStreamSynchronize(stream));
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
   L_22_host = std::sqrt(L_22_host - s_host);
 
   // Check for numeric error with sqrt. If the matrix is not positive definit or
   // the system is very ill conditioned then the A_22 - L_12 * L_12 can be
   // negative, which would result L_22 = NaN. A small positive eps parameter
   // can be used to prevent this.
-  if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) {
-    L_22_host = eps;
-  }
+  if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; }
   ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update");
   raft::update_device(L_22, &L_22_host, 1, stream);
 }
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
index 4a8660741f..bb451bf13a 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
@@ -62,42 +62,10 @@ __global__ void coalescedReductionKernel(OutType* dots,
   }
 }
 
-<<<<<<< HEAD:cpp/include/raft/linalg/detail/coalesced_reduction.cuh
-template <typename InType, typename OutType = InType, typename IdxType = int,
-          typename MainLambda = raft::Nop<InType, IdxType>,
-=======
-/**
- * @brief Compute reduction of the input matrix along the leading dimension
- *
- * @tparam InType the data type of the input
- * @tparam OutType the data type of the output (as well as the data type for
- *  which reduction is performed)
- * @tparam IdxType data type of the indices of the array
- * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*MainLambda)(InType, IdxType);</pre>
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*ReduceLambda)(OutType);</pre>
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*FinalLambda)(OutType);</pre>
- * @param dots the output reduction vector
- * @param data the input matrix
- * @param D leading dimension of data
- * @param N second dimension data
- * @param init initial value to use for the reduction
- * @param main_op elementwise operation to apply before reduction
- * @param reduce_op binary reduction operation
- * @param final_op elementwise operation to apply before storing results
- * @param inplace reduction result added inplace or overwrites old values?
- * @param stream cuda stream where to launch work
- */
 template <typename InType,
           typename OutType      = InType,
           typename IdxType      = int,
           typename MainLambda   = raft::Nop<InType, IdxType>,
->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/coalesced_reduction.cuh
           typename ReduceLambda = raft::Sum<OutType>,
           typename FinalLambda  = raft::Nop<OutType>>
 void coalescedReduction(OutType* dots,
diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh
index b04c813cd8..d5dd416c49 100644
--- a/cpp/include/raft/linalg/detail/contractions.cuh
+++ b/cpp/include/raft/linalg/detail/contractions.cuh
@@ -22,8 +22,7 @@ namespace raft {
 namespace linalg {
 namespace detail {
 
-template <typename DataT, typename IdxT, typename Policy,
-          bool isRowMajor = true>
+template <typename DataT, typename IdxT, typename Policy, bool isRowMajor = true>
 struct Contractions_NT {
  protected:
   typedef Policy P;
@@ -81,16 +80,15 @@ struct Contractions_NT {
 
  public:
   /**
-* @brief Ctor
-* @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major]
-* @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major]
-* @param[in] _m number of rows of X
-* @param[in] _n number of rows of Y
-* @param[in] _k number of cols of X and Y
-* @param[in] _smem shared memory region used during computations
-*/
-  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
-                     IdxT _k, char* _smem)
+   * @brief Ctor
+   * @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major]
+   * @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major]
+   * @param[in] _m number of rows of X
+   * @param[in] _n number of rows of Y
+   * @param[in] _k number of cols of X and Y
+   * @param[in] _smem shared memory region used during computations
+   */
+  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, IdxT _k, char* _smem)
     : m(_m),
       n(_n),
       k(_k),
@@ -107,19 +105,28 @@ struct Contractions_NT {
       sx((DataT*)_smem),
       sy(&(sx[P::SmemPageX])),
       pageWr(0),
-      pageRd(0) {}
+      pageRd(0)
+  {
+  }
 
   /**
-* @brief Ctor
-* @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major]
-* @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major]
-* @param[in] _m number of rows of X
-* @param[in] _n number of rows of Y
-* @param[in] _k number of cols of X and Y
-* @param[in] _smem shared memory region used during computations
-*/
-  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
-                     IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem)
+   * @brief Ctor
+   * @param[in] _x X matrix. [on device] [dim = _m x _k] [row-major]
+   * @param[in] _y Y matrix. [on device] [dim = _n x _k] [row-major]
+   * @param[in] _m number of rows of X
+   * @param[in] _n number of rows of Y
+   * @param[in] _k number of cols of X and Y
+   * @param[in] _smem shared memory region used during computations
+   */
+  DI Contractions_NT(const DataT* _x,
+                     const DataT* _y,
+                     IdxT _m,
+                     IdxT _n,
+                     IdxT _k,
+                     IdxT _lda,
+                     IdxT _ldb,
+                     IdxT _ldd,
+                     char* _smem)
     : m(_m),
       n(_n),
       k(_k),
@@ -133,50 +140,55 @@ struct Contractions_NT {
       sx((DataT*)_smem),
       sy(&(sx[P::SmemPageX])),
       pageWr(0),
-      pageRd(0) {
+      pageRd(0)
+  {
     if (isRowMajor) {
       xrowid = IdxT(blockIdx.y) * P::Mblk + srowid;
       yrowid = IdxT(blockIdx.x) * P::Nblk + srowid;
-      x = _x + xrowid * lda;
-      y = _y + yrowid * ldb;
+      x      = _x + xrowid * lda;
+      y      = _y + yrowid * ldb;
     } else {
       xrowid = IdxT(blockIdx.y) * P::Mblk;
       yrowid = IdxT(blockIdx.x) * P::Nblk;
-      x = _x + xrowid + srowid * lda;
-      y = _y + yrowid + srowid * ldb;
+      x      = _x + xrowid + srowid * lda;
+      y      = _y + yrowid + srowid * ldb;
     }
   }
 
  protected:
   /**
-* @brief Load current block of X/Y from global memory to registers
-* @param[in] kidx current start index of k to be loaded
-*/
-  DI void ldgXY(IdxT kidx) {
+   * @brief Load current block of X/Y from global memory to registers
+   * @param[in] kidx current start index of k to be loaded
+   */
+  DI void ldgXY(IdxT kidx)
+  {
     ldgX(kidx);
     ldgY(kidx);
   }
 
   /**
-* @brief Store current block of X/Y from registers to smem
-* @param[in] kidx current start index of k to be loaded
-*/
-  DI void stsXY() {
+   * @brief Store current block of X/Y from registers to smem
+   * @param[in] kidx current start index of k to be loaded
+   */
+  DI void stsXY()
+  {
     stsX(sx + pageWr * P::SmemPage);
     stsY(sy + pageWr * P::SmemPage);
   }
 
   /**
-* @brief Load X and Y block from shared memory to registers
-* @param[in] kidx k value from the current k-block to be loaded from smem
-*/
-  DI void ldsXY(int kidx) {
+   * @brief Load X and Y block from shared memory to registers
+   * @param[in] kidx k value from the current k-block to be loaded from smem
+   */
+  DI void ldsXY(int kidx)
+  {
     ldsX(kidx, sx + pageRd * P::SmemPage);
     ldsY(kidx, sy + pageRd * P::SmemPage);
   }
 
  private:
-  DI void ldgX(IdxT kidx) {
+  DI void ldgX(IdxT kidx)
+  {
     if (isRowMajor) {
       auto numRows = m;
       auto koffset = kidx + scolid;
@@ -193,11 +205,10 @@ struct Contractions_NT {
       }
     } else {
       const auto numRows = k;
-      auto koffset = scolid;
+      auto koffset       = scolid;
 #pragma unroll
       for (int i = 0; i < P::LdgPerThX; ++i) {
-        if ((koffset + xrowid) < lda &&
-            (srowid + kidx + i * P::LdgRowsX) < numRows) {
+        if ((koffset + xrowid) < lda && (srowid + kidx + i * P::LdgRowsX) < numRows) {
           ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset);
         } else {
 #pragma unroll
@@ -209,7 +220,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void ldgY(IdxT kidx) {
+  DI void ldgY(IdxT kidx)
+  {
     if (isRowMajor) {
       auto numRows = n;
       auto koffset = kidx + scolid;
@@ -229,8 +241,7 @@ struct Contractions_NT {
       auto koffset = scolid;
 #pragma unroll
       for (int i = 0; i < P::LdgPerThY; ++i) {
-        if ((koffset + yrowid) < ldb &&
-            (srowid + kidx + i * P::LdgRowsY) < numRows) {
+        if ((koffset + yrowid) < ldb && (srowid + kidx + i * P::LdgRowsY) < numRows) {
           ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset);
         } else {
 #pragma unroll
@@ -242,7 +253,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void stsX(DataT* smem) {
+  DI void stsX(DataT* smem)
+  {
     auto* saddr = smem + srowid * P::SmemStride + scolid;
 #pragma unroll
     for (int i = 0; i < P::LdgPerThX; ++i) {
@@ -250,7 +262,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void stsY(DataT* smem) {
+  DI void stsY(DataT* smem)
+  {
     auto* saddr = smem + srowid * P::SmemStride + scolid;
 #pragma unroll
     for (int i = 0; i < P::LdgPerThY; ++i) {
@@ -258,7 +271,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void ldsX(int kidx, DataT* smem) {
+  DI void ldsX(int kidx, DataT* smem)
+  {
     if (isRowMajor) {
       auto* saddr = smem + accrowid * P::SmemStride + kidx;
 #pragma unroll
@@ -277,7 +291,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void ldsY(int kidx, DataT* smem) {
+  DI void ldsY(int kidx, DataT* smem)
+  {
     if (isRowMajor) {
       auto* saddr = smem + acccolid * P::SmemStride + kidx;
 #pragma unroll
diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp
index 997e98dcc4..704fe339dc 100644
--- a/cpp/include/raft/linalg/detail/eig.hpp
+++ b/cpp/include/raft/linalg/detail/eig.hpp
@@ -247,34 +247,22 @@ void eigSelDC(const raft::handle_t& handle,
 #endif
 
 template <typename math_t>
-<<<<<<< HEAD:cpp/include/raft/linalg/detail/eig.hpp
-void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows,
-               int n_cols, math_t *eig_vectors, math_t *eig_vals,
-               cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) {
-  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-
-  syevjInfo_t syevj_params = nullptr;
-  CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params));
-  CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(syevj_params, tol));
-  CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps));
-=======
 void eigJacobi(const raft::handle_t& handle,
                const math_t* in,
-               std::size_t n_rows,
-               std::size_t n_cols,
+               int n_rows,
+               int n_cols,
                math_t* eig_vectors,
                math_t* eig_vals,
                cudaStream_t stream,
-               math_t tol           = 1.e-7,
-               std::uint32_t sweeps = 15)
+               math_t tol = 1.e-7,
+               int sweeps = 15)
 {
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   syevjInfo_t syevj_params = nullptr;
   RAFT_CUSOLVER_TRY(cusolverDnCreateSyevjInfo(&syevj_params));
   RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetTolerance(syevj_params, tol));
-  RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetMaxSweeps(syevj_params, static_cast<int>(sweeps)));
->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/eig.cuh
+  RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps));
 
   int lwork;
   RAFT_CUSOLVER_TRY(cusolverDnsyevj_bufferSize(cusolverH,
diff --git a/cpp/include/raft/linalg/detail/functional.cuh b/cpp/include/raft/linalg/detail/functional.cuh
index fec2e27228..4cebd34d08 100644
--- a/cpp/include/raft/linalg/detail/functional.cuh
+++ b/cpp/include/raft/linalg/detail/functional.cuh
@@ -27,9 +27,7 @@ struct divides_scalar {
  public:
   divides_scalar(ArgType scalar) : scalar_(scalar) {}
 
-  __host__ __device__ inline ReturnType operator()(ArgType in) {
-    return in / scalar_;
-  }
+  __host__ __device__ inline ReturnType operator()(ArgType in) { return in / scalar_; }
 
  private:
   ArgType scalar_;
@@ -40,9 +38,7 @@ struct adds_scalar {
  public:
   adds_scalar(ArgType scalar) : scalar_(scalar) {}
 
-  __host__ __device__ inline ReturnType operator()(ArgType in) {
-    return in + scalar_;
-  }
+  __host__ __device__ inline ReturnType operator()(ArgType in) { return in + scalar_; }
 
  private:
   ArgType scalar_;
@@ -53,9 +49,7 @@ struct multiplies_scalar {
  public:
   multiplies_scalar(ArgType scalar) : scalar_(scalar) {}
 
-  __host__ __device__ inline ReturnType operator()(ArgType in) {
-    return in * scalar_;
-  }
+  __host__ __device__ inline ReturnType operator()(ArgType in) { return in * scalar_; }
 
  private:
   ArgType scalar_;
@@ -64,7 +58,8 @@ struct multiplies_scalar {
 template <typename ArgType, typename ReturnType = ArgType>
 struct divides_check_zero {
  public:
-  __host__ __device__ inline ReturnType operator()(ArgType a, ArgType b) {
+  __host__ __device__ inline ReturnType operator()(ArgType a, ArgType b)
+  {
     return (b == static_cast<ArgType>(0)) ? 0.0 : a / b;
   }
 };
diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp
index 8adeb5295f..8a74e78a79 100644
--- a/cpp/include/raft/linalg/detail/gemm.hpp
+++ b/cpp/include/raft/linalg/detail/gemm.hpp
@@ -52,47 +52,6 @@ void gemm(const raft::handle_t& handle,
     cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream));
 }
 
-<<<<<<< HEAD:cpp/include/raft/linalg/detail/gemm.hpp
-=======
-template <typename math_t>
-void gemm(const raft::handle_t& handle,
-          const math_t* a,
-          int n_rows_a,
-          int n_cols_a,
-          const math_t* b,
-          math_t* c,
-          int n_rows_c,
-          int n_cols_c,
-          cublasOperation_t trans_a,
-          cublasOperation_t trans_b,
-          cudaStream_t stream)
-{
-  math_t alpha = math_t(1);
-  math_t beta  = math_t(0);
-  gemm(
-    handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream);
-}
-
-/**
- * @brief A wrapper for CUBLS GEMM function designed for handling all possible
- * combinations of operand layouts.
- * It computes the following equation: Z = alpha . X * Y + beta . Z
- * @tparam T Data type of input/output matrices (float/double)
- * @param handle raft handle
- * @param z output matrix of size M rows x N columns
- * @param x input matrix of size M rows x K columns
- * @param y input matrix of size K rows x N columns
- * @param _M number of rows of X and Z
- * @param _N number of rows of Y and columns of Z
- * @param _K number of columns of X and rows of Y
- * @param isZColMajor Storage layout of Z. true = col major, false = row major
- * @param isXColMajor Storage layout of X. true = col major, false = row major
- * @param isYColMajor Storage layout of Y. true = col major, false = row major
- * @param stream cuda stream
- * @param alpha scalar
- * @param beta scalar
- */
->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/gemm.cuh
 template <typename T>
 void gemm(const raft::handle_t& handle,
           T* z,
diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp
index dc75b70509..854b2333d6 100644
--- a/cpp/include/raft/linalg/detail/lanczos.hpp
+++ b/cpp/include/raft/linalg/detail/lanczos.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-//for cmath:
+// for cmath:
 #define _USE_MATH_DEFINES
 
 #include <cmath>
@@ -41,14 +41,14 @@ namespace spectral {
 namespace detail {
 
 // curandGeneratorNormalX
-inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator,
-                                            float *outputPtr, size_t n,
-                                            float mean, float stddev) {
+inline curandStatus_t curandGenerateNormalX(
+  curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev)
+{
   return curandGenerateNormal(generator, outputPtr, n, mean, stddev);
 }
-inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator,
-                                            double *outputPtr, size_t n,
-                                            double mean, double stddev) {
+inline curandStatus_t curandGenerateNormalX(
+  curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev)
+{
   return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev);
 }
 
@@ -56,7 +56,7 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator,
 // Helper functions
 // =========================================================
 
-/**  
+/**
  *  @brief  Perform Lanczos iteration
  *    Lanczos iteration is performed on a shifted matrix A+shift*I.
  *  @tparam index_type_t the type of data used for indexing.
@@ -86,25 +86,30 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator,
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-int performLanczosIteration(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const *A,
-  index_type_t *iter, index_type_t maxIter, value_type_t shift,
-  value_type_t tol, bool reorthogonalize, value_type_t *__restrict__ alpha_host,
-  value_type_t *__restrict__ beta_host,
-  value_type_t *__restrict__ lanczosVecs_dev,
-  value_type_t *__restrict__ work_dev) {
+int performLanczosIteration(handle_t const& handle,
+                            sparse_matrix_t<index_type_t, value_type_t> const* A,
+                            index_type_t* iter,
+                            index_type_t maxIter,
+                            value_type_t shift,
+                            value_type_t tol,
+                            bool reorthogonalize,
+                            value_type_t* __restrict__ alpha_host,
+                            value_type_t* __restrict__ beta_host,
+                            value_type_t* __restrict__ lanczosVecs_dev,
+                            value_type_t* __restrict__ work_dev)
+{
   // -------------------------------------------------------
   // Variable declaration
   // -------------------------------------------------------
 
   // Useful variables
-  constexpr value_type_t one = 1;
+  constexpr value_type_t one    = 1;
   constexpr value_type_t negOne = -1;
-  constexpr value_type_t zero = 0;
+  constexpr value_type_t zero   = 0;
   value_type_t alpha;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   RAFT_EXPECTS(A != nullptr, "Null matrix pointer.");
 
@@ -118,29 +123,28 @@ int performLanczosIteration(
 
     // Apply matrix
     if (shift != 0)
-      CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev,
+      CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n,
+                               lanczosVecs_dev,
                                n * sizeof(value_type_t),
-                               cudaMemcpyDeviceToDevice, stream));
+                               cudaMemcpyDeviceToDevice,
+                               stream));
     A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n);
 
     // Orthogonalize Lanczos vector
-    CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1,
-                           lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host,
-                           stream));
+    CUBLAS_CHECK(cublasdot(
+      cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream));
 
     alpha = -alpha_host[0];
-    CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1,
-                            lanczosVecs_dev + IDX(0, 1, n), 1, stream));
-    CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1,
-                            beta_host, stream));
+    CUBLAS_CHECK(cublasaxpy(
+      cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream));
+    CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream));
 
     // Check if Lanczos has converged
     if (beta_host[0] <= tol) return 0;
 
     // Normalize Lanczos vector
     alpha = 1 / beta_host[0];
-    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n),
-                            1, stream));
+    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream));
   }
 
   // -------------------------------------------------------
@@ -152,65 +156,121 @@ int performLanczosIteration(
 
     // Apply matrix
     if (shift != 0)
-      CUDA_TRY(cudaMemcpyAsync(
-        lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n,
-        n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream));
-    A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift,
-          lanczosVecs_dev + IDX(0, *iter, n));
+      CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n,
+                               lanczosVecs_dev + (*iter - 1) * n,
+                               n * sizeof(value_type_t),
+                               cudaMemcpyDeviceToDevice,
+                               stream));
+    A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n));
 
     // Full reorthogonalization
     //   "Twice is enough" algorithm per Kahan and Parlett
     if (reorthogonalize) {
-      CUBLAS_CHECK(cublasgemv(
-        cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n,
-        lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream));
-
-      CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne,
-                              lanczosVecs_dev, n, work_dev, 1, &one,
-                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
-
-      CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1),
-                               sizeof(value_type_t), cudaMemcpyDeviceToHost,
+      CUBLAS_CHECK(cublasgemv(cublas_h,
+                              CUBLAS_OP_T,
+                              n,
+                              *iter,
+                              &one,
+                              lanczosVecs_dev,
+                              n,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              &zero,
+                              work_dev,
+                              1,
+                              stream));
+
+      CUBLAS_CHECK(cublasgemv(cublas_h,
+                              CUBLAS_OP_N,
+                              n,
+                              *iter,
+                              &negOne,
+                              lanczosVecs_dev,
+                              n,
+                              work_dev,
+                              1,
+                              &one,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              stream));
+
+      CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1),
+                               work_dev + (*iter - 1),
+                               sizeof(value_type_t),
+                               cudaMemcpyDeviceToHost,
                                stream));
 
-      CUBLAS_CHECK(cublasgemv(
-        cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n,
-        lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream));
-
-      CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne,
-                              lanczosVecs_dev, n, work_dev, 1, &one,
-                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+      CUBLAS_CHECK(cublasgemv(cublas_h,
+                              CUBLAS_OP_T,
+                              n,
+                              *iter,
+                              &one,
+                              lanczosVecs_dev,
+                              n,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              &zero,
+                              work_dev,
+                              1,
+                              stream));
+
+      CUBLAS_CHECK(cublasgemv(cublas_h,
+                              CUBLAS_OP_N,
+                              n,
+                              *iter,
+                              &negOne,
+                              lanczosVecs_dev,
+                              n,
+                              work_dev,
+                              1,
+                              &one,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              stream));
     }
 
     // Orthogonalization with 3-term recurrence relation
     else {
-      CUBLAS_CHECK(cublasdot(cublas_h, n,
-                             lanczosVecs_dev + IDX(0, *iter - 1, n), 1,
-                             lanczosVecs_dev + IDX(0, *iter, n), 1,
-                             alpha_host + (*iter - 1), stream));
+      CUBLAS_CHECK(cublasdot(cublas_h,
+                             n,
+                             lanczosVecs_dev + IDX(0, *iter - 1, n),
+                             1,
+                             lanczosVecs_dev + IDX(0, *iter, n),
+                             1,
+                             alpha_host + (*iter - 1),
+                             stream));
 
       auto alpha = -alpha_host[*iter - 1];
-      CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha,
-                              lanczosVecs_dev + IDX(0, *iter - 1, n), 1,
-                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+      CUBLAS_CHECK(cublasaxpy(cublas_h,
+                              n,
+                              &alpha,
+                              lanczosVecs_dev + IDX(0, *iter - 1, n),
+                              1,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              stream));
 
       alpha = -beta_host[*iter - 2];
-      CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha,
-                              lanczosVecs_dev + IDX(0, *iter - 2, n), 1,
-                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+      CUBLAS_CHECK(cublasaxpy(cublas_h,
+                              n,
+                              &alpha,
+                              lanczosVecs_dev + IDX(0, *iter - 2, n),
+                              1,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              stream));
     }
 
     // Compute residual
-    CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1,
-                            beta_host + *iter - 1, stream));
+    CUBLAS_CHECK(cublasnrm2(
+      cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream));
 
     // Check if Lanczos has converged
     if (beta_host[*iter - 1] <= tol) break;
 
     // Normalize Lanczos vector
     alpha = 1 / beta_host[*iter - 1];
-    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha,
-                            lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
   }
 
   CUDA_TRY(cudaStreamSynchronize(stream));
@@ -218,7 +278,7 @@ int performLanczosIteration(
   return 0;
 }
 
-/** 
+/**
  *  @brief  Find Householder transform for 3-dimensional system
  *    Given an input vector v=[x,y,z]', this function finds a
  *    Householder transform P such that P*v is a multiple of
@@ -236,8 +296,8 @@ int performLanczosIteration(
  *    matrix. Matrix dimensions are 3 x 3.
  */
 template <typename index_type_t, typename value_type_t>
-static void findHouseholder3(value_type_t *v, value_type_t *Pv,
-                             value_type_t *P) {
+static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P)
+{
   // Compute norm of vector
   *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
 
@@ -247,8 +307,7 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv,
   v[0] -= *Pv;
 
   // Normalize Householder vector
-  value_type_t normHouseholder =
-    std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
+  value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
   if (normHouseholder != 0) {
     v[0] /= normHouseholder;
     v[1] /= normHouseholder;
@@ -262,11 +321,13 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv,
   // Construct Householder matrix
   index_type_t i, j;
   for (j = 0; j < 3; ++j)
-    for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j];
-  for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1;
+    for (i = 0; i < 3; ++i)
+      P[IDX(i, j, 3)] = -2 * v[i] * v[j];
+  for (i = 0; i < 3; ++i)
+    P[IDX(i, i, 3)] += 1;
 }
 
-/**  
+/**
  *  @brief  Apply 3-dimensional Householder transform to 4 x 4 matrix
  *    The Householder transform is pre-applied to the top three rows
  *  of the matrix and post-applied to the left three columns. The
@@ -278,7 +339,8 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv,
  *  @param A (Input/output, host memory, 16 entries) 4 x 4 matrix.
  */
 template <typename index_type_t, typename value_type_t>
-static void applyHouseholder3(const value_type_t *v, value_type_t *A) {
+static void applyHouseholder3(const value_type_t* v, value_type_t* A)
+{
   // Loop indices
   index_type_t i, j;
   // Dot product between Householder vector and matrix row/column
@@ -287,19 +349,23 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) {
   // Pre-apply Householder transform
   for (j = 0; j < 4; ++j) {
     vDotA = 0;
-    for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)];
-    for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA;
+    for (i = 0; i < 3; ++i)
+      vDotA += v[i] * A[IDX(i, j, 4)];
+    for (i = 0; i < 3; ++i)
+      A[IDX(i, j, 4)] -= 2 * v[i] * vDotA;
   }
 
   // Post-apply Householder transform
   for (i = 0; i < 4; ++i) {
     vDotA = 0;
-    for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j];
-    for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j];
+    for (j = 0; j < 3; ++j)
+      vDotA += A[IDX(i, j, 4)] * v[j];
+    for (j = 0; j < 3; ++j)
+      A[IDX(i, j, 4)] -= 2 * vDotA * v[j];
   }
 }
 
-/**  
+/**
  *  @brief  Perform one step of Francis QR algorithm
  *    Equivalent to two steps of the classical QR algorithm on a
  *    tridiagonal matrix.
@@ -320,10 +386,14 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) {
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-static int francisQRIteration(index_type_t n, value_type_t shift1,
-                              value_type_t shift2, value_type_t *alpha,
-                              value_type_t *beta, value_type_t *V,
-                              value_type_t *work) {
+static int francisQRIteration(index_type_t n,
+                              value_type_t shift1,
+                              value_type_t shift2,
+                              value_type_t* alpha,
+                              value_type_t* beta,
+                              value_type_t* V,
+                              value_type_t* work)
+{
   // -------------------------------------------------------
   // Variable declaration
   // -------------------------------------------------------
@@ -353,30 +423,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1,
   householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c;
   householder[1] = beta[0] * (alpha[0] + alpha[1] + b);
   householder[2] = beta[0] * beta[1];
-  findHouseholder3<index_type_t, value_type_t>(householder, &temp,
-                                               householderMatrix);
+  findHouseholder3<index_type_t, value_type_t>(householder, &temp, householderMatrix);
 
   // Apply initial Householder transform to create bulge
   memset(bulge, 0, 16 * sizeof(value_type_t));
-  for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i];
+  for (i = 0; i < 4; ++i)
+    bulge[IDX(i, i, 4)] = alpha[i];
   for (i = 0; i < 3; ++i) {
     bulge[IDX(i + 1, i, 4)] = beta[i];
     bulge[IDX(i, i + 1, 4)] = beta[i];
   }
   applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-  Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix,
-                             3, 0, work, n);
+  Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n);
   memcpy(V, work, 3 * n * sizeof(value_type_t));
 
   // Chase bulge to bottom-right of matrix with Householder transforms
   for (pos = 0; pos < n - 4; ++pos) {
     // Move to next position
-    alpha[pos] = bulge[IDX(0, 0, 4)];
+    alpha[pos]     = bulge[IDX(0, 0, 4)];
     householder[0] = bulge[IDX(1, 0, 4)];
     householder[1] = bulge[IDX(2, 0, 4)];
     householder[2] = bulge[IDX(3, 0, 4)];
     for (j = 0; j < 3; ++j)
-      for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
+      for (i = 0; i < 3; ++i)
+        bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
     bulge[IDX(3, 0, 4)] = 0;
     bulge[IDX(3, 1, 4)] = 0;
     bulge[IDX(3, 2, 4)] = beta[pos + 3];
@@ -386,22 +456,22 @@ static int francisQRIteration(index_type_t n, value_type_t shift1,
     bulge[IDX(3, 3, 4)] = alpha[pos + 4];
 
     // Apply Householder transform
-    findHouseholder3<index_type_t, value_type_t>(householder, beta + pos,
-                                                 householderMatrix);
+    findHouseholder3<index_type_t, value_type_t>(householder, beta + pos, householderMatrix);
     applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-    Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n),
-                               n, householderMatrix, 3, 0, work, n);
+    Lapack<value_type_t>::gemm(
+      false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n);
     memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t));
   }
 
   // Apply penultimate Householder transform
   //   Values in the last row and column are zero
-  alpha[n - 4] = bulge[IDX(0, 0, 4)];
+  alpha[n - 4]   = bulge[IDX(0, 0, 4)];
   householder[0] = bulge[IDX(1, 0, 4)];
   householder[1] = bulge[IDX(2, 0, 4)];
   householder[2] = bulge[IDX(3, 0, 4)];
   for (j = 0; j < 3; ++j)
-    for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
+    for (i = 0; i < 3; ++i)
+      bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
   bulge[IDX(3, 0, 4)] = 0;
   bulge[IDX(3, 1, 4)] = 0;
   bulge[IDX(3, 2, 4)] = 0;
@@ -409,37 +479,36 @@ static int francisQRIteration(index_type_t n, value_type_t shift1,
   bulge[IDX(1, 3, 4)] = 0;
   bulge[IDX(2, 3, 4)] = 0;
   bulge[IDX(3, 3, 4)] = 0;
-  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 4,
-                                               householderMatrix);
+  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 4, householderMatrix);
   applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-  Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n,
-                             householderMatrix, 3, 0, work, n);
+  Lapack<value_type_t>::gemm(
+    false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n);
   memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t));
 
   // Apply final Householder transform
   //   Values in the last two rows and columns are zero
-  alpha[n - 3] = bulge[IDX(0, 0, 4)];
+  alpha[n - 3]   = bulge[IDX(0, 0, 4)];
   householder[0] = bulge[IDX(1, 0, 4)];
   householder[1] = bulge[IDX(2, 0, 4)];
   householder[2] = 0;
   for (j = 0; j < 3; ++j)
-    for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
-  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 3,
-                                               householderMatrix);
+    for (i = 0; i < 3; ++i)
+      bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
+  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 3, householderMatrix);
   applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-  Lapack<value_type_t>::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n,
-                             householderMatrix, 3, 0, work, n);
+  Lapack<value_type_t>::gemm(
+    false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n);
   memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t));
 
   // Bulge has been eliminated
   alpha[n - 2] = bulge[IDX(0, 0, 4)];
   alpha[n - 1] = bulge[IDX(1, 1, 4)];
-  beta[n - 2] = bulge[IDX(1, 0, 4)];
+  beta[n - 2]  = bulge[IDX(1, 0, 4)];
 
   return 0;
 }
 
-/**  
+/**
  *  @brief  Perform implicit restart of Lanczos algorithm
  *    Shifts are Chebyshev nodes of unwanted region of matrix spectrum.
  *  @tparam index_type_t the type of data used for indexing.
@@ -475,23 +544,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1,
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-static int lanczosRestart(
-  handle_t const &handle, index_type_t n, index_type_t iter,
-  index_type_t iter_new, value_type_t *shiftUpper, value_type_t *shiftLower,
-  value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host,
-  value_type_t *__restrict__ V_host, value_type_t *__restrict__ work_host,
-  value_type_t *__restrict__ lanczosVecs_dev,
-  value_type_t *__restrict__ work_dev, bool smallest_eig) {
+static int lanczosRestart(handle_t const& handle,
+                          index_type_t n,
+                          index_type_t iter,
+                          index_type_t iter_new,
+                          value_type_t* shiftUpper,
+                          value_type_t* shiftLower,
+                          value_type_t* __restrict__ alpha_host,
+                          value_type_t* __restrict__ beta_host,
+                          value_type_t* __restrict__ V_host,
+                          value_type_t* __restrict__ work_host,
+                          value_type_t* __restrict__ lanczosVecs_dev,
+                          value_type_t* __restrict__ work_dev,
+                          bool smallest_eig)
+{
   // -------------------------------------------------------
   // Variable declaration
   // -------------------------------------------------------
 
   // Useful constants
   constexpr value_type_t zero = 0;
-  constexpr value_type_t one = 1;
+  constexpr value_type_t one  = 1;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // Loop index
   index_type_t i;
@@ -502,12 +578,12 @@ static int lanczosRestart(
   index_type_t restartSteps = iter - iter_new;
 
   // Ritz values from Lanczos method
-  value_type_t *ritzVals_host = work_host + 3 * iter;
+  value_type_t* ritzVals_host = work_host + 3 * iter;
   // Shifts for implicit restart
-  value_type_t *shifts_host;
+  value_type_t* shifts_host;
 
   // Orthonormal matrix for similarity transform
-  value_type_t *V_dev = work_dev + n * iter;
+  value_type_t* V_dev = work_dev + n * iter;
 
   // -------------------------------------------------------
   // Implementation
@@ -525,7 +601,8 @@ static int lanczosRestart(
 
   // Initialize similarity transform with identity matrix
   memset(V_host, 0, iter * iter * sizeof(value_type_t));
-  for (i = 0; i < iter; ++i) V_host[IDX(i, i, iter)] = 1;
+  for (i = 0; i < iter; ++i)
+    V_host[IDX(i, i, iter)] = 1;
 
   // Determine interval to suppress eigenvalues
   if (smallest_eig) {
@@ -549,49 +626,71 @@ static int lanczosRestart(
   // Calculate Chebyshev nodes as shifts
   shifts_host = ritzVals_host;
   for (i = 0; i < restartSteps; ++i) {
-    shifts_host[i] =
-      cos((i + 0.5) * static_cast<value_type_t>(M_PI) / restartSteps);
+    shifts_host[i] = cos((i + 0.5) * static_cast<value_type_t>(M_PI) / restartSteps);
     shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower));
     shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower));
   }
 
   // Apply Francis QR algorithm to implicitly restart Lanczos
   for (i = 0; i < restartSteps; i += 2)
-    if (francisQRIteration(iter, shifts_host[i], shifts_host[i + 1], alpha_host,
-                           beta_host, V_host, work_host))
+    if (francisQRIteration(
+          iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host))
       WARNING("error in implicitly shifted QR algorithm");
 
   // Obtain new residual
-  CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice, stream));
-
-  beta_host[iter - 1] =
-    beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
-  CUBLAS_CHECK(cublasgemv(
-    cublas_h, CUBLAS_OP_N, n, iter, beta_host + iter_new - 1, lanczosVecs_dev,
-    n, V_dev + IDX(0, iter_new, iter), 1, beta_host + iter - 1,
-    lanczosVecs_dev + IDX(0, iter, n), 1, stream));
+  CUDA_TRY(cudaMemcpyAsync(
+    V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream));
+
+  beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
+  CUBLAS_CHECK(cublasgemv(cublas_h,
+                          CUBLAS_OP_N,
+                          n,
+                          iter,
+                          beta_host + iter_new - 1,
+                          lanczosVecs_dev,
+                          n,
+                          V_dev + IDX(0, iter_new, iter),
+                          1,
+                          beta_host + iter - 1,
+                          lanczosVecs_dev + IDX(0, iter, n),
+                          1,
+                          stream));
 
   // Obtain new Lanczos vectors
-  CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, iter_new, iter,
-                          &one, lanczosVecs_dev, n, V_dev, iter, &zero,
-                          work_dev, n, stream));
-
-  CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev,
+  CUBLAS_CHECK(cublasgemm(cublas_h,
+                          CUBLAS_OP_N,
+                          CUBLAS_OP_N,
+                          n,
+                          iter_new,
+                          iter,
+                          &one,
+                          lanczosVecs_dev,
+                          n,
+                          V_dev,
+                          iter,
+                          &zero,
+                          work_dev,
+                          n,
+                          stream));
+
+  CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev,
+                           work_dev,
                            n * iter_new * sizeof(value_type_t),
-                           cudaMemcpyDeviceToDevice, stream));
+                           cudaMemcpyDeviceToDevice,
+                           stream));
 
   // Normalize residual to obtain new Lanczos vector
-  CUDA_TRY(cudaMemcpyAsync(
-    lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n),
-    n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream));
+  CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n),
+                           lanczosVecs_dev + IDX(0, iter, n),
+                           n * sizeof(value_type_t),
+                           cudaMemcpyDeviceToDevice,
+                           stream));
 
-  CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1,
-                          beta_host + iter_new - 1, stream));
+  CUBLAS_CHECK(cublasnrm2(
+    cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream));
 
   auto h_beta = 1 / beta_host[iter_new - 1];
-  CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta,
-                          lanczosVecs_dev + IDX(0, iter_new, n), 1, stream));
+  CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream));
 
   return 0;
 }
@@ -602,19 +701,28 @@ static int lanczosRestart(
 namespace detail {
 
 template <typename index_type_t, typename value_type_t>
-int computeSmallestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const *A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t *effIter,
-  index_type_t *totalIter, value_type_t *shift,
-  value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host,
-  value_type_t *__restrict__ lanczosVecs_dev,
-  value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) {
+int computeSmallestEigenvectors(handle_t const& handle,
+                                sparse_matrix_t<index_type_t, value_type_t> const* A,
+                                index_type_t nEigVecs,
+                                index_type_t maxIter,
+                                index_type_t restartIter,
+                                value_type_t tol,
+                                bool reorthogonalize,
+                                index_type_t* effIter,
+                                index_type_t* totalIter,
+                                value_type_t* shift,
+                                value_type_t* __restrict__ alpha_host,
+                                value_type_t* __restrict__ beta_host,
+                                value_type_t* __restrict__ lanczosVecs_dev,
+                                value_type_t* __restrict__ work_dev,
+                                value_type_t* __restrict__ eigVals_dev,
+                                value_type_t* __restrict__ eigVecs_dev,
+                                unsigned long long seed)
+{
   using namespace raft::spectral::detail;
 
   // Useful constants
-  constexpr value_type_t one = 1;
+  constexpr value_type_t one  = 1;
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
@@ -634,21 +742,20 @@ int computeSmallestEigenvectors(
   index_type_t i;
 
   // Host memory
-  value_type_t *Z_host;     // Eigenvectors in Lanczos basis
-  value_type_t *work_host;  // Workspace
+  value_type_t* Z_host;     // Eigenvectors in Lanczos basis
+  value_type_t* work_host;  // Workspace
 
   // -------------------------------------------------------
   // Check that parameters are valid
   // -------------------------------------------------------
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
-               "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
   RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter.");
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // -------------------------------------------------------
   // Variable initialization
@@ -661,12 +768,11 @@ int computeSmallestEigenvectors(
   std::vector<value_type_t> Z_host_v(restartIter * restartIter);
   std::vector<value_type_t> work_host_v(4 * restartIter);
 
-  Z_host = Z_host_v.data();
+  Z_host    = Z_host_v.data();
   work_host = work_host_v.data();
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(
-    cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // -------------------------------------------------------
   // Compute largest eigenvalue to determine shift
@@ -689,10 +795,18 @@ int computeSmallestEigenvectors(
 
   // Obtain tridiagonal matrix with Lanczos
   *effIter = 0;
-  *shift = 0;
-  status = performLanczosIteration<index_type_t, value_type_t>(
-    handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host,
-    beta_host, lanczosVecs_dev, work_dev);
+  *shift   = 0;
+  status   = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                               A,
+                                                               effIter,
+                                                               maxIter_curr,
+                                                               *shift,
+                                                               0.0,
+                                                               reorthogonalize,
+                                                               alpha_host,
+                                                               beta_host,
+                                                               lanczosVecs_dev,
+                                                               work_dev);
   if (status) WARNING("error in Lanczos iteration");
 
   // Determine largest eigenvalue
@@ -707,9 +821,17 @@ int computeSmallestEigenvectors(
   // Obtain tridiagonal matrix with Lanczos
   *effIter = 0;
 
-  status = performLanczosIteration<index_type_t, value_type_t>(
-    handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host,
-    beta_host, lanczosVecs_dev, work_dev);
+  status = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                               A,
+                                                               effIter,
+                                                               maxIter_curr,
+                                                               *shift,
+                                                               0,
+                                                               reorthogonalize,
+                                                               alpha_host,
+                                                               beta_host,
+                                                               lanczosVecs_dev,
+                                                               work_dev);
   if (status) WARNING("error in Lanczos iteration");
   *totalIter += *effIter;
 
@@ -726,9 +848,19 @@ int computeSmallestEigenvectors(
     if (iter_new == *effIter) break;
 
     // Implicit restart of Lanczos method
-    status = lanczosRestart<index_type_t, value_type_t>(
-      handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host,
-      beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true);
+    status = lanczosRestart<index_type_t, value_type_t>(handle,
+                                                        n,
+                                                        *effIter,
+                                                        iter_new,
+                                                        &shiftUpper,
+                                                        &shiftLower,
+                                                        alpha_host,
+                                                        beta_host,
+                                                        Z_host,
+                                                        work_host,
+                                                        lanczosVecs_dev,
+                                                        work_dev,
+                                                        true);
     if (status) WARNING("error in Lanczos implicit restart");
     *effIter = iter_new;
 
@@ -737,9 +869,17 @@ int computeSmallestEigenvectors(
 
     // Proceed with Lanczos method
 
-    status = performLanczosIteration<index_type_t, value_type_t>(
-      handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower),
-      reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev);
+    status = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                                 A,
+                                                                 effIter,
+                                                                 maxIter_curr,
+                                                                 *shift,
+                                                                 tol * fabs(shiftLower),
+                                                                 reorthogonalize,
+                                                                 alpha_host,
+                                                                 beta_host,
+                                                                 lanczosVecs_dev,
+                                                                 work_dev);
     if (status) WARNING("error in Lanczos iteration");
     *totalIter += *effIter - iter_new;
   }
@@ -750,32 +890,52 @@ int computeSmallestEigenvectors(
   }
 
   // Solve tridiagonal system
-  memcpy(work_host + 2 * (*effIter), alpha_host,
-         (*effIter) * sizeof(value_type_t));
-  memcpy(work_host + 3 * (*effIter), beta_host,
-         (*effIter - 1) * sizeof(value_type_t));
-  Lapack<value_type_t>::steqr('I', *effIter, work_host + 2 * (*effIter),
-                              work_host + 3 * (*effIter), Z_host, *effIter,
+  memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t));
+  memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t));
+  Lapack<value_type_t>::steqr('I',
+                              *effIter,
+                              work_host + 2 * (*effIter),
+                              work_host + 3 * (*effIter),
+                              Z_host,
+                              *effIter,
                               work_host);
 
   // Obtain desired eigenvalues by applying shift
-  for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift;
-  for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0;
+  for (i = 0; i < *effIter; ++i)
+    work_host[i + 2 * (*effIter)] -= *shift;
+  for (i = *effIter; i < nEigVecs; ++i)
+    work_host[i + 2 * (*effIter)] = 0;
 
   // Copy results to device memory
-  CUDA_TRY(cudaMemcpyAsync(eigVals_dev, work_host + 2 * (*effIter),
+  CUDA_TRY(cudaMemcpyAsync(eigVals_dev,
+                           work_host + 2 * (*effIter),
                            nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice, stream));
+                           cudaMemcpyHostToDevice,
+                           stream));
 
-  CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host,
+  CUDA_TRY(cudaMemcpyAsync(work_dev,
+                           Z_host,
                            (*effIter) * nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice, stream));
+                           cudaMemcpyHostToDevice,
+                           stream));
   CHECK_CUDA(stream);
 
   // Convert eigenvectors from Lanczos basis to standard basis
-  CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs,
-                          *effIter, &one, lanczosVecs_dev, n, work_dev,
-                          *effIter, &zero, eigVecs_dev, n, stream));
+  CUBLAS_CHECK(cublasgemm(cublas_h,
+                          CUBLAS_OP_N,
+                          CUBLAS_OP_N,
+                          n,
+                          nEigVecs,
+                          *effIter,
+                          &one,
+                          lanczosVecs_dev,
+                          n,
+                          work_dev,
+                          *effIter,
+                          &zero,
+                          eigVecs_dev,
+                          n,
+                          stream));
 
   // Clean up and exit
   curandDestroyGenerator(randGen);
@@ -783,20 +943,25 @@ int computeSmallestEigenvectors(
 }
 
 template <typename index_type_t, typename value_type_t>
-int computeSmallestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const &A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t &iter,
-  value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) {
+int computeSmallestEigenvectors(handle_t const& handle,
+                                sparse_matrix_t<index_type_t, value_type_t> const& A,
+                                index_type_t nEigVecs,
+                                index_type_t maxIter,
+                                index_type_t restartIter,
+                                value_type_t tol,
+                                bool reorthogonalize,
+                                index_type_t& iter,
+                                value_type_t* __restrict__ eigVals_dev,
+                                value_type_t* __restrict__ eigVecs_dev,
+                                unsigned long long seed = 1234567)
+{
   using namespace raft::spectral::detail;
 
   // Matrix dimension
   index_type_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
-               "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
@@ -806,8 +971,8 @@ int computeSmallestEigenvectors(
   std::vector<value_type_t> alpha_host_v(restartIter);
   std::vector<value_type_t> beta_host_v(restartIter);
 
-  value_type_t *alpha_host = alpha_host_v.data();
-  value_type_t *beta_host = beta_host_v.data();
+  value_type_t* alpha_host = alpha_host_v.data();
+  value_type_t* beta_host  = beta_host_v.data();
 
   vector_t<value_type_t> lanczosVecs_dev(handle, n * (restartIter + 1));
   vector_t<value_type_t> work_dev(handle, (n + restartIter) * restartIter);
@@ -815,29 +980,50 @@ int computeSmallestEigenvectors(
   // Perform Lanczos method
   index_type_t effIter;
   value_type_t shift;
-  int status = raft::detail::computeSmallestEigenvectors(
-    handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter,
-    &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(),
-    eigVals_dev, eigVecs_dev, seed);
+  int status = raft::detail::computeSmallestEigenvectors(handle,
+                                                         &A,
+                                                         nEigVecs,
+                                                         maxIter,
+                                                         restartIter,
+                                                         tol,
+                                                         reorthogonalize,
+                                                         &effIter,
+                                                         &iter,
+                                                         &shift,
+                                                         alpha_host,
+                                                         beta_host,
+                                                         lanczosVecs_dev.raw(),
+                                                         work_dev.raw(),
+                                                         eigVals_dev,
+                                                         eigVecs_dev,
+                                                         seed);
 
   // Clean up and return
   return status;
 }
 
 template <typename index_type_t, typename value_type_t>
-int computeLargestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const *A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t *effIter,
-  index_type_t *totalIter, value_type_t *__restrict__ alpha_host,
-  value_type_t *__restrict__ beta_host,
-  value_type_t *__restrict__ lanczosVecs_dev,
-  value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) {
+int computeLargestEigenvectors(handle_t const& handle,
+                               sparse_matrix_t<index_type_t, value_type_t> const* A,
+                               index_type_t nEigVecs,
+                               index_type_t maxIter,
+                               index_type_t restartIter,
+                               value_type_t tol,
+                               bool reorthogonalize,
+                               index_type_t* effIter,
+                               index_type_t* totalIter,
+                               value_type_t* __restrict__ alpha_host,
+                               value_type_t* __restrict__ beta_host,
+                               value_type_t* __restrict__ lanczosVecs_dev,
+                               value_type_t* __restrict__ work_dev,
+                               value_type_t* __restrict__ eigVals_dev,
+                               value_type_t* __restrict__ eigVecs_dev,
+                               unsigned long long seed)
+{
   using namespace raft::spectral::detail;
 
   // Useful constants
-  constexpr value_type_t one = 1;
+  constexpr value_type_t one  = 1;
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
@@ -853,8 +1039,8 @@ int computeLargestEigenvectors(
   index_type_t i;
 
   // Host memory
-  value_type_t *Z_host;     // Eigenvectors in Lanczos basis
-  value_type_t *work_host;  // Workspace
+  value_type_t* Z_host;     // Eigenvectors in Lanczos basis
+  value_type_t* work_host;  // Workspace
 
   // -------------------------------------------------------
   // Check that LAPACK is enabled
@@ -864,15 +1050,14 @@ int computeLargestEigenvectors(
   // -------------------------------------------------------
   // Check that parameters are valid
   // -------------------------------------------------------
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
-               "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
   RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter.");
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // -------------------------------------------------------
   // Variable initialization
@@ -885,12 +1070,11 @@ int computeLargestEigenvectors(
   std::vector<value_type_t> Z_host_v(restartIter * restartIter);
   std::vector<value_type_t> work_host_v(4 * restartIter);
 
-  Z_host = Z_host_v.data();
+  Z_host    = Z_host_v.data();
   work_host = work_host_v.data();
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(
-    cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // -------------------------------------------------------
   // Compute largest eigenvalue
@@ -910,13 +1094,21 @@ int computeLargestEigenvectors(
   CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream));
 
   // Obtain tridiagonal matrix with Lanczos
-  *effIter = 0;
+  *effIter               = 0;
   value_type_t shift_val = 0.0;
-  value_type_t *shift = &shift_val;
-
-  status = performLanczosIteration<index_type_t, value_type_t>(
-    handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host,
-    beta_host, lanczosVecs_dev, work_dev);
+  value_type_t* shift    = &shift_val;
+
+  status = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                               A,
+                                                               effIter,
+                                                               maxIter_curr,
+                                                               *shift,
+                                                               0,
+                                                               reorthogonalize,
+                                                               alpha_host,
+                                                               beta_host,
+                                                               lanczosVecs_dev,
+                                                               work_dev);
   if (status) WARNING("error in Lanczos iteration");
   *totalIter += *effIter;
 
@@ -933,9 +1125,19 @@ int computeLargestEigenvectors(
     if (iter_new == *effIter) break;
 
     // Implicit restart of Lanczos method
-    status = lanczosRestart<index_type_t, value_type_t>(
-      handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host,
-      beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false);
+    status = lanczosRestart<index_type_t, value_type_t>(handle,
+                                                        n,
+                                                        *effIter,
+                                                        iter_new,
+                                                        &shiftUpper,
+                                                        &shiftLower,
+                                                        alpha_host,
+                                                        beta_host,
+                                                        Z_host,
+                                                        work_host,
+                                                        lanczosVecs_dev,
+                                                        work_dev,
+                                                        false);
     if (status) WARNING("error in Lanczos implicit restart");
     *effIter = iter_new;
 
@@ -944,9 +1146,17 @@ int computeLargestEigenvectors(
 
     // Proceed with Lanczos method
 
-    status = performLanczosIteration<index_type_t, value_type_t>(
-      handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower),
-      reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev);
+    status = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                                 A,
+                                                                 effIter,
+                                                                 maxIter_curr,
+                                                                 *shift,
+                                                                 tol * fabs(shiftLower),
+                                                                 reorthogonalize,
+                                                                 alpha_host,
+                                                                 beta_host,
+                                                                 lanczosVecs_dev,
+                                                                 work_dev);
     if (status) WARNING("error in Lanczos iteration");
     *totalIter += *effIter - iter_new;
   }
@@ -956,15 +1166,18 @@ int computeLargestEigenvectors(
     WARNING("implicitly restarted Lanczos failed to converge");
   }
   for (int i = 0; i < restartIter; ++i) {
-    for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0;
+    for (int j = 0; j < restartIter; ++j)
+      Z_host[i * restartIter + j] = 0;
   }
   // Solve tridiagonal system
-  memcpy(work_host + 2 * (*effIter), alpha_host,
-         (*effIter) * sizeof(value_type_t));
-  memcpy(work_host + 3 * (*effIter), beta_host,
-         (*effIter - 1) * sizeof(value_type_t));
-  Lapack<value_type_t>::steqr('I', *effIter, work_host + 2 * (*effIter),
-                              work_host + 3 * (*effIter), Z_host, *effIter,
+  memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t));
+  memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t));
+  Lapack<value_type_t>::steqr('I',
+                              *effIter,
+                              work_host + 2 * (*effIter),
+                              work_host + 3 * (*effIter),
+                              Z_host,
+                              *effIter,
                               work_host);
 
   // note: We need to pick the top nEigVecs eigenvalues
@@ -989,29 +1202,45 @@ int computeLargestEigenvectors(
   //}
 
   // Obtain desired eigenvalues by applying shift
-  for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift;
+  for (i = 0; i < *effIter; ++i)
+    work_host[i + 2 * (*effIter)] -= *shift;
 
   for (i = 0; i < top_eigenparis_idx_offset; ++i)
     work_host[i + 2 * (*effIter)] = 0;
 
   // Copy results to device memory
   // skip smallest eigenvalue if needed
-  CUDA_TRY(cudaMemcpyAsync(
-    eigVals_dev, work_host + 2 * (*effIter) + top_eigenparis_idx_offset,
-    nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream));
+  CUDA_TRY(cudaMemcpyAsync(eigVals_dev,
+                           work_host + 2 * (*effIter) + top_eigenparis_idx_offset,
+                           nEigVecs * sizeof(value_type_t),
+                           cudaMemcpyHostToDevice,
+                           stream));
 
   // skip smallest eigenvector if needed
   CUDA_TRY(cudaMemcpyAsync(work_dev,
                            Z_host + (top_eigenparis_idx_offset * (*effIter)),
                            (*effIter) * nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice, stream));
+                           cudaMemcpyHostToDevice,
+                           stream));
 
   CHECK_CUDA(stream);
 
   // Convert eigenvectors from Lanczos basis to standard basis
-  CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs,
-                          *effIter, &one, lanczosVecs_dev, n, work_dev,
-                          *effIter, &zero, eigVecs_dev, n, stream));
+  CUBLAS_CHECK(cublasgemm(cublas_h,
+                          CUBLAS_OP_N,
+                          CUBLAS_OP_N,
+                          n,
+                          nEigVecs,
+                          *effIter,
+                          &one,
+                          lanczosVecs_dev,
+                          n,
+                          work_dev,
+                          *effIter,
+                          &zero,
+                          eigVecs_dev,
+                          n,
+                          stream));
 
   // Clean up and exit
   curandDestroyGenerator(randGen);
@@ -1019,18 +1248,23 @@ int computeLargestEigenvectors(
 }
 
 template <typename index_type_t, typename value_type_t>
-int computeLargestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const &A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t &iter,
-  value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) {
+int computeLargestEigenvectors(handle_t const& handle,
+                               sparse_matrix_t<index_type_t, value_type_t> const& A,
+                               index_type_t nEigVecs,
+                               index_type_t maxIter,
+                               index_type_t restartIter,
+                               value_type_t tol,
+                               bool reorthogonalize,
+                               index_type_t& iter,
+                               value_type_t* __restrict__ eigVals_dev,
+                               value_type_t* __restrict__ eigVecs_dev,
+                               unsigned long long seed = 123456)
+{
   // Matrix dimension
   index_type_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
-               "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
@@ -1040,18 +1274,30 @@ int computeLargestEigenvectors(
   std::vector<value_type_t> alpha_host_v(restartIter);
   std::vector<value_type_t> beta_host_v(restartIter);
 
-  value_type_t *alpha_host = alpha_host_v.data();
-  value_type_t *beta_host = beta_host_v.data();
+  value_type_t* alpha_host = alpha_host_v.data();
+  value_type_t* beta_host  = beta_host_v.data();
 
   vector_t<value_type_t> lanczosVecs_dev(handle, n * (restartIter + 1));
   vector_t<value_type_t> work_dev(handle, (n + restartIter) * restartIter);
 
   // Perform Lanczos method
   index_type_t effIter;
-  int status = raft::detail::computeLargestEigenvectors(
-    handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter,
-    &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(),
-    eigVals_dev, eigVecs_dev, seed);
+  int status = raft::detail::computeLargestEigenvectors(handle,
+                                                        &A,
+                                                        nEigVecs,
+                                                        maxIter,
+                                                        restartIter,
+                                                        tol,
+                                                        reorthogonalize,
+                                                        &effIter,
+                                                        &iter,
+                                                        alpha_host,
+                                                        beta_host,
+                                                        lanczosVecs_dev.raw(),
+                                                        work_dev.raw(),
+                                                        eigVals_dev,
+                                                        eigVecs_dev,
+                                                        seed);
 
   // Clean up and return
   return status;
diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh
index 0e649fb937..7f1ba3da0d 100644
--- a/cpp/include/raft/linalg/detail/map.cuh
+++ b/cpp/include/raft/linalg/detail/map.cuh
@@ -25,21 +25,18 @@ namespace raft {
 namespace linalg {
 namespace detail {
 
-template <typename InType, typename OutType, typename MapOp, int TPB,
-          typename... Args>
-__global__ void mapKernel(OutType *out, size_t len, MapOp map, const InType *in,
-                          Args... args) {
+template <typename InType, typename OutType, typename MapOp, int TPB, typename... Args>
+__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args)
+{
   auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
 
-  if (idx < len) {
-    out[idx] = map(in[idx], args[idx]...);
-  }
+  if (idx < len) { out[idx] = map(in[idx], args[idx]...); }
 }
 
-template <typename InType, typename OutType, typename MapOp, int TPB,
-          typename... Args>
-void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream,
-             const InType *in, Args... args) {
+template <typename InType, typename OutType, typename MapOp, int TPB, typename... Args>
+void mapImpl(
+  OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
+{
   const int nblks = raft::ceildiv(len, (size_t)TPB);
   mapKernel<InType, OutType, MapOp, TPB, Args...>
     <<<nblks, TPB, 0, stream>>>(out, len, map, in, args...);
diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh
index a7031bc48f..089bc627be 100644
--- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh
+++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh
@@ -1,18 +1,18 @@
 /*
-* Copyright (c) 2021, NVIDIA CORPORATION.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 #pragma once
 
@@ -25,50 +25,66 @@ namespace raft {
 namespace linalg {
 namespace detail {
 
-struct sum_tag {};
+struct sum_tag {
+};
 
 template <typename InType, typename OutType, int TPB>
-__device__ void reduce(OutType *out, const InType acc, sum_tag) {
+__device__ void reduce(OutType* out, const InType acc, sum_tag)
+{
   typedef cub::BlockReduce<InType, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   OutType tmp = BlockReduce(temp_storage).Sum(acc);
-  if (threadIdx.x == 0) {
-    raft::myAtomicAdd(out, tmp);
-  }
+  if (threadIdx.x == 0) { raft::myAtomicAdd(out, tmp); }
 }
 
 template <typename InType, typename OutType, int TPB, typename ReduceLambda>
-__device__ void reduce(OutType *out, const InType acc, ReduceLambda op) {
+__device__ void reduce(OutType* out, const InType acc, ReduceLambda op)
+{
   typedef cub::BlockReduce<InType, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   OutType tmp = BlockReduce(temp_storage).Reduce(acc, op);
-  if (threadIdx.x == 0) {
-    raft::myAtomicReduce(out, tmp, op);
-  }
+  if (threadIdx.x == 0) { raft::myAtomicReduce(out, tmp, op); }
 }
 
-template <typename InType, typename OutType, typename MapOp,
-          typename ReduceLambda, int TPB, typename... Args>
-__global__ void mapThenReduceKernel(OutType *out, size_t len, OutType neutral,
-                                    MapOp map, ReduceLambda op,
-                                    const InType *in, Args... args) {
+template <typename InType,
+          typename OutType,
+          typename MapOp,
+          typename ReduceLambda,
+          int TPB,
+          typename... Args>
+__global__ void mapThenReduceKernel(OutType* out,
+                                    size_t len,
+                                    OutType neutral,
+                                    MapOp map,
+                                    ReduceLambda op,
+                                    const InType* in,
+                                    Args... args)
+{
   OutType acc = neutral;
-  auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
+  auto idx    = (threadIdx.x + (blockIdx.x * blockDim.x));
 
-  if (idx < len) {
-    acc = map(in[idx], args[idx]...);
-  }
+  if (idx < len) { acc = map(in[idx], args[idx]...); }
 
   __syncthreads();
 
   reduce<InType, OutType, TPB>(out, acc, op);
 }
 
-template <typename InType, typename OutType, typename MapOp,
-          typename ReduceLambda, int TPB, typename... Args>
-void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map,
-                       ReduceLambda op, cudaStream_t stream, const InType *in,
-                       Args... args) {
+template <typename InType,
+          typename OutType,
+          typename MapOp,
+          typename ReduceLambda,
+          int TPB,
+          typename... Args>
+void mapThenReduceImpl(OutType* out,
+                       size_t len,
+                       OutType neutral,
+                       MapOp map,
+                       ReduceLambda op,
+                       cudaStream_t stream,
+                       const InType* in,
+                       Args... args)
+{
   raft::update_device(out, &neutral, 1, stream);
   const int nblks = raft::ceildiv(len, (size_t)TPB);
   mapThenReduceKernel<InType, OutType, MapOp, ReduceLambda, TPB, Args...>
diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh
index e8956521df..8fdee6d30e 100644
--- a/cpp/include/raft/linalg/detail/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh
@@ -103,42 +103,10 @@ __global__ void stridedReductionKernel(OutType* dots,
     raft::myAtomicReduce(dots + colStart, temp[myidx], reduce_op);
 }
 
-<<<<<<< HEAD:cpp/include/raft/linalg/detail/strided_reduction.cuh
-template <typename InType, typename OutType = InType, typename IdxType = int,
-          typename MainLambda = raft::Nop<InType, IdxType>,
-=======
-/**
- * @brief Compute reduction of the input matrix along the strided dimension
- *
- * @tparam InType the data type of the input
- * @tparam OutType the data type of the output (as well as the data type for
- *  which reduction is performed)
- * @tparam IdxType data type of the indices of the array
- * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*MainLambda)(InType, IdxType);</pre>
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*ReduceLambda)(OutType);</pre>
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm)
- * It must be a 'callable' supporting the following input and output:
- * <pre>OutType (*FinalLambda)(OutType);</pre>
- * @param dots the output reduction vector
- * @param data the input matrix
- * @param D leading dimension of data
- * @param N second dimension data
- * @param init initial value to use for the reduction
- * @param main_op elementwise operation to apply before reduction
- * @param reduce_op binary reduction operation
- * @param final_op elementwise operation to apply before storing results
- * @param inplace reduction result added inplace or overwrites old values?
- * @param stream cuda stream where to launch work
- */
 template <typename InType,
           typename OutType      = InType,
           typename IdxType      = int,
           typename MainLambda   = raft::Nop<InType, IdxType>,
->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/strided_reduction.cuh
           typename ReduceLambda = raft::Sum<OutType>,
           typename FinalLambda  = raft::Nop<OutType>>
 void stridedReduction(OutType* dots,
diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh
index a58888a24f..a2e91a381a 100644
--- a/cpp/include/raft/linalg/detail/subtract.cuh
+++ b/cpp/include/raft/linalg/detail/subtract.cuh
@@ -25,20 +25,23 @@ namespace linalg {
 namespace detail {
 
 template <class math_t, typename IdxType>
-__global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
-                                           const math_t *singleScalarDev,
-                                           IdxType len) {
-  //TODO: kernel do not use shared memory in current implementation
+__global__ void subtract_dev_scalar_kernel(math_t* outDev,
+                                           const math_t* inDev,
+                                           const math_t* singleScalarDev,
+                                           IdxType len)
+{
+  // TODO: kernel do not use shared memory in current implementation
   int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
-  if (i < len) {
-    outDev[i] = inDev[i] - *singleScalarDev;
-  }
+  if (i < len) { outDev[i] = inDev[i] - *singleScalarDev; }
 }
 
 template <typename math_t, typename IdxType = int, int TPB = 256>
-void subtractDevScalar(math_t *outDev, const math_t *inDev,
-                       const math_t *singleScalarDev, IdxType len,
-                       cudaStream_t stream) {
+void subtractDevScalar(math_t* outDev,
+                       const math_t* inDev,
+                       const math_t* singleScalarDev,
+                       IdxType len,
+                       cudaStream_t stream)
+{
   // Just for the note - there is no way to express such operation with cuBLAS in effective way
   // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda
   const IdxType nblks = raft::ceildiv(len, (IdxType)TPB);
diff --git a/cpp/include/raft/linalg/detail/svd.cuh b/cpp/include/raft/linalg/detail/svd.cuh
index 691bef4d35..0d9cbc05dc 100644
--- a/cpp/include/raft/linalg/detail/svd.cuh
+++ b/cpp/include/raft/linalg/detail/svd.cuh
@@ -145,8 +145,7 @@ void svdEig(const raft::handle_t& handle,
                      beta,
                      stream);
 
-  raft::linalg::eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S,
-                      stream);
+  raft::linalg::eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream);
 
   raft::matrix::colReverse(V, n_cols, n_cols, stream);
   raft::matrix::rowReverse(S, n_cols, 1, stream);
@@ -239,67 +238,6 @@ void svdJacobi(const raft::handle_t& handle,
   RAFT_CUSOLVER_TRY(cusolverDnDestroyGesvdjInfo(gesvdj_params));
 }
 
-<<<<<<< HEAD:cpp/include/raft/linalg/detail/svd.cuh
-=======
-/**
- * @brief reconstruct a matrix use left and right singular vectors and
- * singular values
- * @param handle: raft handle
- * @param U: left singular vectors of size n_rows x k
- * @param S: square matrix with singular values on its diagonal, k x k
- * @param V: right singular vectors of size n_cols x k
- * @param out: reconstructed matrix to be returned
- * @param n_rows: number rows of output matrix
- * @param n_cols: number columns of output matrix
- * @param k: number of singular values
- * @param stream cuda stream
- */
-template <typename math_t>
-void svdReconstruction(const raft::handle_t& handle,
-                       math_t* U,
-                       math_t* S,
-                       math_t* V,
-                       math_t* out,
-                       int n_rows,
-                       int n_cols,
-                       int k,
-                       cudaStream_t stream)
-{
-  const math_t alpha = 1.0, beta = 0.0;
-  rmm::device_uvector<math_t> SVT(k * n_cols, stream);
-
-  raft::linalg::gemm(
-    handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream);
-  raft::linalg::gemm(handle,
-                     U,
-                     n_rows,
-                     k,
-                     SVT.data(),
-                     out,
-                     n_rows,
-                     n_cols,
-                     CUBLAS_OP_N,
-                     CUBLAS_OP_N,
-                     alpha,
-                     beta,
-                     stream);
-}
-
-/**
- * @brief reconstruct a matrix use left and right singular vectors and
- * singular values
- * @param handle: raft handle
- * @param A_d: input matrix
- * @param U: left singular vectors of size n_rows x k
- * @param S_vec: singular values as a vector
- * @param V: right singular vectors of size n_cols x k
- * @param n_rows: number rows of output matrix
- * @param n_cols: number columns of output matrix
- * @param k: number of singular values to be computed, 1.0 for normal SVD
- * @param tol: tolerance for the evaluation
- * @param stream cuda stream
- */
->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/svd.cuh
 template <typename math_t>
 bool evaluateSVDByL2Norm(const raft::handle_t& handle,
                          math_t* A_d,
diff --git a/cpp/include/raft/linalg/detail/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh
index 5e93157ed7..0089400201 100644
--- a/cpp/include/raft/linalg/detail/unary_op.cuh
+++ b/cpp/include/raft/linalg/detail/unary_op.cuh
@@ -51,55 +51,12 @@ void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStr
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
-<<<<<<< HEAD:cpp/include/raft/linalg/detail/unary_op.cuh
-template <typename InType, typename Lambda, typename IdxType = int,
-          typename OutType = InType, int TPB = 256>
-void unaryOpCaller(OutType *out, const InType *in, IdxType len, Lambda op,
-                   cudaStream_t stream) {
-  if (len <= 0) return;  //silently skip in case of 0 length input
-  constexpr auto maxSize =
-    sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
-  size_t bytes = len * maxSize;
-  uint64_t inAddr = uint64_t(in);
-  uint64_t outAddr = uint64_t(out);
-  if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 &&
-      outAddr % 16 == 0) {
-    unaryOpImpl<InType, 16 / maxSize, Lambda, OutType, IdxType, TPB>(
-      out, in, len, op, stream);
-  } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 &&
-             outAddr % 8 == 0) {
-    unaryOpImpl<InType, 8 / maxSize, Lambda, OutType, IdxType, TPB>(
-      out, in, len, op, stream);
-  } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 &&
-             outAddr % 4 == 0) {
-    unaryOpImpl<InType, 4 / maxSize, Lambda, OutType, IdxType, TPB>(
-      out, in, len, op, stream);
-  } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 &&
-             outAddr % 2 == 0) {
-    unaryOpImpl<InType, 2 / maxSize, Lambda, OutType, IdxType, TPB>(
-      out, in, len, op, stream);
-=======
-/**
- * @brief perform element-wise unary operation in the input array
- * @tparam InType input data-type
- * @tparam Lambda the device-lambda performing the actual operation
- * @tparam OutType output data-type
- * @tparam IdxType Integer type used to for addressing
- * @tparam TPB threads-per-block in the final kernel launched
- * @param out the output array
- * @param in the input array
- * @param len number of elements in the input array
- * @param op the device-lambda
- * @param stream cuda stream where to launch work
- * @note Lambda must be a functor with the following signature:
- *       `OutType func(const InType& val);`
- */
 template <typename InType,
           typename Lambda,
           typename IdxType = int,
           typename OutType = InType,
           int TPB          = 256>
-void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream)
+void unaryOpCaller(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream)
 {
   if (len <= 0) return;  // silently skip in case of 0 length input
   constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
@@ -114,7 +71,6 @@ void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_
     unaryOpImpl<InType, 4 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
   } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) {
     unaryOpImpl<InType, 2 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/unary_op.cuh
   } else if (1 / maxSize) {
     unaryOpImpl<InType, 1 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
   } else {
@@ -129,33 +85,9 @@ __global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op)
   if (idx < len) { op(out + idx, idx); }
 }
 
-<<<<<<< HEAD:cpp/include/raft/linalg/detail/unary_op.cuh
-template <typename OutType, typename Lambda, typename IdxType = int,
-          int TPB = 256>
-void writeOnlyUnaryOpCaller(OutType *out, IdxType len, Lambda op,
-                            cudaStream_t stream) {
-=======
-/**
- * @brief Perform an element-wise unary operation into the output array
- *
- * Compared to `unaryOp()`, this method does not do any reads from any inputs
- *
- * @tparam OutType output data-type
- * @tparam Lambda  the device-lambda performing the actual operation
- * @tparam IdxType Integer type used to for addressing
- * @tparam TPB     threads-per-block in the final kernel launched
- *
- * @param[out] out    the output array [on device] [len = len]
- * @param[in]  len    number of elements in the input array
- * @param[in]  op     the device-lambda which must be of the form:
- *                    `void func(OutType* outLocationOffset, IdxType idx);`
- *                    where outLocationOffset will be out + idx.
- * @param[in]  stream cuda stream where to launch work
- */
 template <typename OutType, typename Lambda, typename IdxType = int, int TPB = 256>
-void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream)
+void writeOnlyUnaryOpCaller(OutType* out, IdxType len, Lambda op, cudaStream_t stream)
 {
->>>>>>> upstream/branch-22.02:cpp/include/raft/linalg/unary_op.cuh
   if (len <= 0) return;  // silently skip in case of 0 length input
   auto nblks = raft::ceildiv<IdxType>(len, TPB);
   writeOnlyUnaryOpKernel<OutType, Lambda, IdxType><<<nblks, TPB, 0, stream>>>(out, len, op);
diff --git a/cpp/include/raft/linalg/divide.hpp b/cpp/include/raft/linalg/divide.hpp
index e4eead777c..ecf0d3a48d 100644
--- a/cpp/include/raft/linalg/divide.hpp
+++ b/cpp/include/raft/linalg/divide.hpp
@@ -36,8 +36,8 @@ using detail::divides_scalar;
  * @{
  */
 template <typename math_t, typename IdxType = int>
-void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len,
-                  cudaStream_t stream) {
+void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
+{
   unaryOp(out, in, len, divides_scalar<math_t>(scalar), stream);
 }
 /** @} */
diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp
index 288b43f27f..91a475f25f 100644
--- a/cpp/include/raft/linalg/eig.hpp
+++ b/cpp/include/raft/linalg/eig.hpp
@@ -35,9 +35,14 @@ namespace linalg {
  * @{
  */
 template <typename math_t>
-void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows,
-           std::size_t n_cols, math_t *eig_vectors, math_t *eig_vals,
-           cudaStream_t stream) {
+void eigDC(const raft::handle_t& handle,
+           const math_t* in,
+           std::size_t n_rows,
+           std::size_t n_cols,
+           math_t* eig_vectors,
+           math_t* eig_vals,
+           cudaStream_t stream)
+{
   detail::eigDC(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream);
 }
 
@@ -62,11 +67,17 @@ using detail::OVERWRITE_INPUT;
  * @{
  */
 template <typename math_t>
-void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
-              int n_eig_vals, math_t *eig_vectors, math_t *eig_vals,
-              EigVecMemUsage memUsage, cudaStream_t stream) {
-  detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors,
-                   eig_vals, memUsage, stream);
+void eigSelDC(const raft::handle_t& handle,
+              math_t* in,
+              int n_rows,
+              int n_cols,
+              int n_eig_vals,
+              math_t* eig_vectors,
+              math_t* eig_vals,
+              EigVecMemUsage memUsage,
+              cudaStream_t stream)
+{
+  detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, eig_vals, memUsage, stream);
 }
 
 #endif
@@ -86,11 +97,17 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
  * @{
  */
 template <typename math_t>
-void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows,
-               int n_cols, math_t *eig_vectors, math_t *eig_vals,
-               cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) {
-  detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream,
-                    tol, sweeps);
+void eigJacobi(const raft::handle_t& handle,
+               const math_t* in,
+               int n_rows,
+               int n_cols,
+               math_t* eig_vectors,
+               math_t* eig_vals,
+               cudaStream_t stream,
+               math_t tol = 1.e-7,
+               int sweeps = 15)
+{
+  detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/eltwise.hpp b/cpp/include/raft/linalg/eltwise.hpp
index 63b824e6f7..5a5b5c647b 100644
--- a/cpp/include/raft/linalg/eltwise.hpp
+++ b/cpp/include/raft/linalg/eltwise.hpp
@@ -38,19 +38,17 @@ using detail::adds_scalar;
  * @{
  */
 template <typename InType, typename IdxType, typename OutType = InType>
-void scalarAdd(OutType *out, const InType *in, InType scalar, IdxType len,
-               cudaStream_t stream) {
-  raft::linalg::unaryOp(out, in, len, adds_scalar<InType, OutType>(scalar),
-                        stream);
+void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
+{
+  raft::linalg::unaryOp(out, in, len, adds_scalar<InType, OutType>(scalar), stream);
 }
 
 using detail::multiplies_scalar;
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len,
-                    cudaStream_t stream) {
-  raft::linalg::unaryOp(out, in, len,
-                        multiplies_scalar<InType, OutType>(scalar), stream);
+void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
+{
+  raft::linalg::unaryOp(out, in, len, multiplies_scalar<InType, OutType>(scalar), stream);
 }
 /** @} */
 
@@ -66,34 +64,39 @@ void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len,
  * @{
  */
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseAdd(OutType *out, const InType *in1, const InType *in2, IdxType len,
-                cudaStream_t stream) {
+void eltwiseAdd(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(out, in1, in2, len, thrust::plus<InType>(), stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseSub(OutType *out, const InType *in1, const InType *in2, IdxType len,
-                cudaStream_t stream) {
+void eltwiseSub(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(out, in1, in2, len, thrust::minus<InType>(), stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseMultiply(OutType *out, const InType *in1, const InType *in2,
-                     IdxType len, cudaStream_t stream) {
+void eltwiseMultiply(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(out, in1, in2, len, thrust::multiplies<InType>(), stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseDivide(OutType *out, const InType *in1, const InType *in2,
-                   IdxType len, cudaStream_t stream) {
+void eltwiseDivide(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(out, in1, in2, len, thrust::divides<InType>(), stream);
 }
 
 using detail::divides_check_zero;
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseDivideCheckZero(OutType *out, const InType *in1, const InType *in2,
-                            IdxType len, cudaStream_t stream) {
+void eltwiseDivideCheckZero(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(out, in1, in2, len, divides_check_zero<InType, OutType>(), stream);
 }
 /** @} */
diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp
index 9326714a41..624aa7232b 100644
--- a/cpp/include/raft/linalg/gemm.hpp
+++ b/cpp/include/raft/linalg/gemm.hpp
@@ -40,27 +40,45 @@ namespace linalg {
  * @param stream cuda stream
  */
 template <typename math_t>
-void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
-          int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c,
-          cublasOperation_t trans_a, cublasOperation_t trans_b, math_t alpha,
-          math_t beta, cudaStream_t stream) {
-  detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a,
-               trans_b, alpha, beta, stream);
+void gemm(const raft::handle_t& handle,
+          const math_t* a,
+          int n_rows_a,
+          int n_cols_a,
+          const math_t* b,
+          math_t* c,
+          int n_rows_c,
+          int n_cols_c,
+          cublasOperation_t trans_a,
+          cublasOperation_t trans_b,
+          math_t alpha,
+          math_t beta,
+          cudaStream_t stream)
+{
+  detail::gemm(
+    handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream);
 }
 
 template <typename math_t>
-void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
-          int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c,
-          cublasOperation_t trans_a, cublasOperation_t trans_b,
-          cudaStream_t stream) {
+void gemm(const raft::handle_t& handle,
+          const math_t* a,
+          int n_rows_a,
+          int n_cols_a,
+          const math_t* b,
+          math_t* c,
+          int n_rows_c,
+          int n_cols_c,
+          cublasOperation_t trans_a,
+          cublasOperation_t trans_b,
+          cudaStream_t stream)
+{
   math_t alpha = math_t(1);
-  math_t beta = math_t(0);
-  gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a,
-       trans_b, alpha, beta, stream);
+  math_t beta  = math_t(0);
+  gemm(
+    handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream);
 }
 
 /**
- * @brief A wrapper for CUBLS GEMM function designed for handling all possible 
+ * @brief A wrapper for cuBLAS GEMM function designed for handling all possible
  * combinations of operand layouts.
  * It computes the following equation: Z = alpha . X * Y + beta . Z
  * @tparam T Data type of input/output matrices (float/double)
@@ -79,11 +97,22 @@ void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
  * @param beta scalar
  */
 template <typename T>
-void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
-          int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor,
-          cudaStream_t stream, T alpha = T(1.0), T beta = T(0.0)) {
-  detail::gemm(handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor,
-               isYColMajor, stream, alpha, beta);
+void gemm(const raft::handle_t& handle,
+          T* z,
+          T* x,
+          T* y,
+          int _M,
+          int _N,
+          int _K,
+          bool isZColMajor,
+          bool isXColMajor,
+          bool isYColMajor,
+          cudaStream_t stream,
+          T alpha = T(1.0),
+          T beta  = T(0.0))
+{
+  detail::gemm(
+    handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, alpha, beta);
 }
 
 }  // end namespace linalg
diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp
index 87ffb76163..34db473edb 100644
--- a/cpp/include/raft/linalg/lanczos.hpp
+++ b/cpp/include/raft/linalg/lanczos.hpp
@@ -74,19 +74,41 @@ namespace raft {
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeSmallestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const *A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t *effIter,
-  index_type_t *totalIter, value_type_t *shift,
-  value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host,
-  value_type_t *__restrict__ lanczosVecs_dev,
-  value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) {
-  return raft::detail::computeSmallestEigenvectors(
-    handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, effIter,
-    totalIter, shift, alpha_host, beta_host, lanczosVecs_dev, work_dev,
-    eigVals_dev, eigVecs_dev, seed);
+int computeSmallestEigenvectors(handle_t const& handle,
+                                sparse_matrix_t<index_type_t, value_type_t> const* A,
+                                index_type_t nEigVecs,
+                                index_type_t maxIter,
+                                index_type_t restartIter,
+                                value_type_t tol,
+                                bool reorthogonalize,
+                                index_type_t* effIter,
+                                index_type_t* totalIter,
+                                value_type_t* shift,
+                                value_type_t* __restrict__ alpha_host,
+                                value_type_t* __restrict__ beta_host,
+                                value_type_t* __restrict__ lanczosVecs_dev,
+                                value_type_t* __restrict__ work_dev,
+                                value_type_t* __restrict__ eigVals_dev,
+                                value_type_t* __restrict__ eigVecs_dev,
+                                unsigned long long seed)
+{
+  return raft::detail::computeSmallestEigenvectors(handle,
+                                                   A,
+                                                   nEigVecs,
+                                                   maxIter,
+                                                   restartIter,
+                                                   tol,
+                                                   reorthogonalize,
+                                                   effIter,
+                                                   totalIter,
+                                                   shift,
+                                                   alpha_host,
+                                                   beta_host,
+                                                   lanczosVecs_dev,
+                                                   work_dev,
+                                                   eigVals_dev,
+                                                   eigVecs_dev,
+                                                   seed);
 }
 
 /**
@@ -127,15 +149,29 @@ int computeSmallestEigenvectors(
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeSmallestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const &A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t &iter,
-  value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) {
-  return raft::detail::computeSmallestEigenvectors(
-    handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, iter,
-    eigVals_dev, eigVecs_dev, seed);
+int computeSmallestEigenvectors(handle_t const& handle,
+                                sparse_matrix_t<index_type_t, value_type_t> const& A,
+                                index_type_t nEigVecs,
+                                index_type_t maxIter,
+                                index_type_t restartIter,
+                                value_type_t tol,
+                                bool reorthogonalize,
+                                index_type_t& iter,
+                                value_type_t* __restrict__ eigVals_dev,
+                                value_type_t* __restrict__ eigVecs_dev,
+                                unsigned long long seed = 1234567)
+{
+  return raft::detail::computeSmallestEigenvectors(handle,
+                                                   A,
+                                                   nEigVecs,
+                                                   maxIter,
+                                                   restartIter,
+                                                   tol,
+                                                   reorthogonalize,
+                                                   iter,
+                                                   eigVals_dev,
+                                                   eigVecs_dev,
+                                                   seed);
 }
 
 // =========================================================
@@ -187,19 +223,39 @@ int computeSmallestEigenvectors(
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeLargestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const *A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t *effIter,
-  index_type_t *totalIter, value_type_t *__restrict__ alpha_host,
-  value_type_t *__restrict__ beta_host,
-  value_type_t *__restrict__ lanczosVecs_dev,
-  value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) {
-  return raft::detail::computeLargestEigenvectors(
-    handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, effIter,
-    totalIter, alpha_host, beta_host, lanczosVecs_dev, work_dev, eigVals_dev,
-    eigVecs_dev, seed);
+int computeLargestEigenvectors(handle_t const& handle,
+                               sparse_matrix_t<index_type_t, value_type_t> const* A,
+                               index_type_t nEigVecs,
+                               index_type_t maxIter,
+                               index_type_t restartIter,
+                               value_type_t tol,
+                               bool reorthogonalize,
+                               index_type_t* effIter,
+                               index_type_t* totalIter,
+                               value_type_t* __restrict__ alpha_host,
+                               value_type_t* __restrict__ beta_host,
+                               value_type_t* __restrict__ lanczosVecs_dev,
+                               value_type_t* __restrict__ work_dev,
+                               value_type_t* __restrict__ eigVals_dev,
+                               value_type_t* __restrict__ eigVecs_dev,
+                               unsigned long long seed)
+{
+  return raft::detail::computeLargestEigenvectors(handle,
+                                                  A,
+                                                  nEigVecs,
+                                                  maxIter,
+                                                  restartIter,
+                                                  tol,
+                                                  reorthogonalize,
+                                                  effIter,
+                                                  totalIter,
+                                                  alpha_host,
+                                                  beta_host,
+                                                  lanczosVecs_dev,
+                                                  work_dev,
+                                                  eigVals_dev,
+                                                  eigVecs_dev,
+                                                  seed);
 }
 
 /**
@@ -240,15 +296,29 @@ int computeLargestEigenvectors(
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeLargestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const &A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t &iter,
-  value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) {
-  return raft::detail::computeLargestEigenvectors(
-    handle, A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, iter,
-    eigVals_dev, eigVecs_dev, seed);
+int computeLargestEigenvectors(handle_t const& handle,
+                               sparse_matrix_t<index_type_t, value_type_t> const& A,
+                               index_type_t nEigVecs,
+                               index_type_t maxIter,
+                               index_type_t restartIter,
+                               value_type_t tol,
+                               bool reorthogonalize,
+                               index_type_t& iter,
+                               value_type_t* __restrict__ eigVals_dev,
+                               value_type_t* __restrict__ eigVecs_dev,
+                               unsigned long long seed = 123456)
+{
+  return raft::detail::computeLargestEigenvectors(handle,
+                                                  A,
+                                                  nEigVecs,
+                                                  maxIter,
+                                                  restartIter,
+                                                  tol,
+                                                  reorthogonalize,
+                                                  iter,
+                                                  eigVals_dev,
+                                                  eigVecs_dev,
+                                                  seed);
 }
 
 }  // namespace raft
diff --git a/cpp/include/raft/linalg/map.hpp b/cpp/include/raft/linalg/map.hpp
index 71ac959f77..c14fb7ba2b 100644
--- a/cpp/include/raft/linalg/map.hpp
+++ b/cpp/include/raft/linalg/map.hpp
@@ -41,10 +41,9 @@ template <typename InType,
           int TPB = 256,
           typename... Args,
           typename OutType = InType>
-void map(OutType *out, size_t len, MapOp map, cudaStream_t stream,
-         const InType *in, Args... args) {
-  detail::mapImpl<InType, OutType, MapOp, TPB, Args...>(out, len, map, stream,
-                                                        in, args...);
+void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
+{
+  detail::mapImpl<InType, OutType, MapOp, TPB, Args...>(out, len, map, stream, in, args...);
 }
 
 }  // namespace linalg
diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp
index 149c2401f5..d4d7087339 100644
--- a/cpp/include/raft/linalg/map_then_reduce.hpp
+++ b/cpp/include/raft/linalg/map_then_reduce.hpp
@@ -40,11 +40,11 @@ template <typename InType,
           int TPB = 256,
           typename... Args,
           typename OutType = InType>
-void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream,
-                      const InType *in, Args... args) {
-  detail::mapThenReduceImpl<InType, OutType, MapOp, detail::sum_tag, TPB,
-                            Args...>(out, len, (OutType)0, map,
-                                     detail::sum_tag(), stream, in, args...);
+void mapThenSumReduce(
+  OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
+{
+  detail::mapThenReduceImpl<InType, OutType, MapOp, detail::sum_tag, TPB, Args...>(
+    out, len, (OutType)0, map, detail::sum_tag(), stream, in, args...);
 }
 
 /**
@@ -65,11 +65,21 @@ void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream,
  * @param args additional input arrays
  */
 
-template <typename InType, typename MapOp, typename ReduceLambda, int TPB = 256,
-          typename OutType = InType, typename... Args>
-void mapThenReduce(OutType *out, size_t len, OutType neutral, MapOp map,
-                   ReduceLambda op, cudaStream_t stream, const InType *in,
-                   Args... args) {
+template <typename InType,
+          typename MapOp,
+          typename ReduceLambda,
+          int TPB          = 256,
+          typename OutType = InType,
+          typename... Args>
+void mapThenReduce(OutType* out,
+                   size_t len,
+                   OutType neutral,
+                   MapOp map,
+                   ReduceLambda op,
+                   cudaStream_t stream,
+                   const InType* in,
+                   Args... args)
+{
   detail::mapThenReduceImpl<InType, OutType, MapOp, ReduceLambda, TPB, Args...>(
     out, len, neutral, map, op, stream, in, args...);
 }
diff --git a/cpp/include/raft/linalg/matrix_vector_op.hpp b/cpp/include/raft/linalg/matrix_vector_op.hpp
index 2cfaa0564c..f088ef4dce 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.hpp
+++ b/cpp/include/raft/linalg/matrix_vector_op.hpp
@@ -45,11 +45,17 @@ namespace linalg {
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
-void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D,
-                    IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op,
-                    cudaStream_t stream) {
-  detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op,
-                         stream);
+void matrixVectorOp(Type* out,
+                    const Type* matrix,
+                    const Type* vec,
+                    IdxType D,
+                    IdxType N,
+                    bool rowMajor,
+                    bool bcastAlongRows,
+                    Lambda op,
+                    cudaStream_t stream)
+{
+  detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
 }
 
 /**
@@ -77,11 +83,18 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
-void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1,
-                    const Type *vec2, IdxType D, IdxType N, bool rowMajor,
-                    bool bcastAlongRows, Lambda op, cudaStream_t stream) {
-  detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor,
-                         bcastAlongRows, op, stream);
+void matrixVectorOp(Type* out,
+                    const Type* matrix,
+                    const Type* vec1,
+                    const Type* vec2,
+                    IdxType D,
+                    IdxType N,
+                    bool rowMajor,
+                    bool bcastAlongRows,
+                    Lambda op,
+                    cudaStream_t stream)
+{
+  detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/qr.hpp b/cpp/include/raft/linalg/qr.hpp
index ad19b361c2..b0e9eed5e2 100644
--- a/cpp/include/raft/linalg/qr.hpp
+++ b/cpp/include/raft/linalg/qr.hpp
@@ -37,8 +37,13 @@ namespace linalg {
  * @{
  */
 template <typename math_t>
-void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q,
-            int n_rows, int n_cols, cudaStream_t stream) {
+void qrGetQ(const raft::handle_t& handle,
+            const math_t* M,
+            math_t* Q,
+            int n_rows,
+            int n_cols,
+            cudaStream_t stream)
+{
   detail::qrGetQ(handle, M, Q, n_rows, n_cols, stream);
 }
 
@@ -53,8 +58,14 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q,
  * @param stream cuda stream
  */
 template <typename math_t>
-void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R,
-             int n_rows, int n_cols, cudaStream_t stream) {
+void qrGetQR(const raft::handle_t& handle,
+             math_t* M,
+             math_t* Q,
+             math_t* R,
+             int n_rows,
+             int n_cols,
+             cudaStream_t stream)
+{
   detail::qrGetQR(handle, M, Q, R, n_rows, n_cols, stream);
 }
 /** @} */
diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp
index 3935e648dc..7e2b5229ec 100644
--- a/cpp/include/raft/linalg/strided_reduction.hpp
+++ b/cpp/include/raft/linalg/strided_reduction.hpp
@@ -48,17 +48,24 @@ namespace linalg {
  * @param inplace reduction result added inplace or overwrites old values?
  * @param stream cuda stream where to launch work
  */
-template <typename InType, typename OutType = InType, typename IdxType = int,
-          typename MainLambda = raft::Nop<InType, IdxType>,
+template <typename InType,
+          typename OutType      = InType,
+          typename IdxType      = int,
+          typename MainLambda   = raft::Nop<InType, IdxType>,
           typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda = raft::Nop<OutType>>
-void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N,
-                      OutType init, cudaStream_t stream, bool inplace = false,
-                      MainLambda main_op = raft::Nop<InType, IdxType>(),
+          typename FinalLambda  = raft::Nop<OutType>>
+void stridedReduction(OutType* dots,
+                      const InType* data,
+                      IdxType D,
+                      IdxType N,
+                      OutType init,
+                      cudaStream_t stream,
+                      bool inplace           = false,
+                      MainLambda main_op     = raft::Nop<InType, IdxType>(),
                       ReduceLambda reduce_op = raft::Sum<OutType>(),
-                      FinalLambda final_op = raft::Nop<OutType>()) {
-  detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op,
-                           reduce_op, final_op);
+                      FinalLambda final_op   = raft::Nop<OutType>())
+{
+  detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/subtract.hpp b/cpp/include/raft/linalg/subtract.hpp
index 820a29cb33..88946646c8 100644
--- a/cpp/include/raft/linalg/subtract.hpp
+++ b/cpp/include/raft/linalg/subtract.hpp
@@ -62,7 +62,8 @@ void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream
   binaryOp<InT, decltype(op), OutT, IdxType>(out, in1, in2, len, op, stream);
 }
 
-/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i]
+/** Subtract single value pointed by singleScalarDev parameter in device memory from inDev[i] and
+ * write result to outDev[i]
  * @tparam math_t data-type upon which the math operation will be performed
  * @tparam IdxType Integer type used to for addressing
  * @param outDev the output buffer
@@ -73,9 +74,12 @@ void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream
  * @remark block size has not been tuned
  */
 template <typename math_t, typename IdxType = int, int TPB = 256>
-void subtractDevScalar(math_t *outDev, const math_t *inDev,
-                       const math_t *singleScalarDev, IdxType len,
-                       cudaStream_t stream) {
+void subtractDevScalar(math_t* outDev,
+                       const math_t* inDev,
+                       const math_t* singleScalarDev,
+                       IdxType len,
+                       cudaStream_t stream)
+{
   detail::subtractDevScalar(outDev, inDev, singleScalarDev, len, stream);
 }
 
diff --git a/cpp/include/raft/linalg/svd.hpp b/cpp/include/raft/linalg/svd.hpp
index 970c339090..62ac19b592 100644
--- a/cpp/include/raft/linalg/svd.hpp
+++ b/cpp/include/raft/linalg/svd.hpp
@@ -40,18 +40,42 @@ namespace linalg {
 // TODO: couldn't template this function due to cusolverDnSgesvd and
 // cusolverSnSgesvd. Check if there is any other way.
 template <typename T>
-void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
-           T *sing_vals, T *left_sing_vecs, T *right_sing_vecs,
-           bool trans_right, bool gen_left_vec, bool gen_right_vec,
-           cudaStream_t stream) {
-  detail::svdQR(handle, in, n_rows, n_cols, sing_vals, left_sing_vecs,
-                right_sing_vecs, trans_right, gen_left_vec, gen_right_vec,
+void svdQR(const raft::handle_t& handle,
+           T* in,
+           int n_rows,
+           int n_cols,
+           T* sing_vals,
+           T* left_sing_vecs,
+           T* right_sing_vecs,
+           bool trans_right,
+           bool gen_left_vec,
+           bool gen_right_vec,
+           cudaStream_t stream)
+{
+  detail::svdQR(handle,
+                in,
+                n_rows,
+                n_cols,
+                sing_vals,
+                left_sing_vecs,
+                right_sing_vecs,
+                trans_right,
+                gen_left_vec,
+                gen_right_vec,
                 stream);
 }
 
 template <typename T>
-void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S,
-            T *U, T *V, bool gen_left_vec, cudaStream_t stream) {
+void svdEig(const raft::handle_t& handle,
+            T* in,
+            int n_rows,
+            int n_cols,
+            T* S,
+            T* U,
+            T* V,
+            bool gen_left_vec,
+            cudaStream_t stream)
+{
   detail::svdEig(handle, in, n_rows, n_cols, S, U, V, gen_left_vec, stream);
 }
 
@@ -73,13 +97,31 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S,
  * @param stream cuda stream
  */
 template <typename math_t>
-void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
-               math_t *sing_vals, math_t *left_sing_vecs,
-               math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec,
-               math_t tol, int max_sweeps, cudaStream_t stream) {
-  detail::svdJacobi(handle, in, n_rows, n_cols, sing_vals, left_sing_vecs,
-                    right_sing_vecs, gen_left_vec, gen_right_vec, tol,
-                    max_sweeps, stream);
+void svdJacobi(const raft::handle_t& handle,
+               math_t* in,
+               int n_rows,
+               int n_cols,
+               math_t* sing_vals,
+               math_t* left_sing_vecs,
+               math_t* right_sing_vecs,
+               bool gen_left_vec,
+               bool gen_right_vec,
+               math_t tol,
+               int max_sweeps,
+               cudaStream_t stream)
+{
+  detail::svdJacobi(handle,
+                    in,
+                    n_rows,
+                    n_cols,
+                    sing_vals,
+                    left_sing_vecs,
+                    right_sing_vecs,
+                    gen_left_vec,
+                    gen_right_vec,
+                    tol,
+                    max_sweeps,
+                    stream);
 }
 
 /**
@@ -96,16 +138,34 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
  * @param stream cuda stream
  */
 template <typename math_t>
-void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S,
-                       math_t *V, math_t *out, int n_rows, int n_cols, int k,
-                       cudaStream_t stream) {
+void svdReconstruction(const raft::handle_t& handle,
+                       math_t* U,
+                       math_t* S,
+                       math_t* V,
+                       math_t* out,
+                       int n_rows,
+                       int n_cols,
+                       int k,
+                       cudaStream_t stream)
+{
   const math_t alpha = 1.0, beta = 0.0;
   rmm::device_uvector<math_t> SVT(k * n_cols, stream);
 
-  raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N,
-                     CUBLAS_OP_T, alpha, beta, stream);
-  raft::linalg::gemm(handle, U, n_rows, k, SVT.data(), out, n_rows, n_cols,
-                     CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream);
+  raft::linalg::gemm(
+    handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream);
+  raft::linalg::gemm(handle,
+                     U,
+                     n_rows,
+                     k,
+                     SVT.data(),
+                     out,
+                     n_rows,
+                     n_cols,
+                     CUBLAS_OP_N,
+                     CUBLAS_OP_N,
+                     alpha,
+                     beta,
+                     stream);
 }
 
 /**
@@ -123,11 +183,18 @@ void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S,
  * @param stream cuda stream
  */
 template <typename math_t>
-bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U,
-                         math_t *S_vec, math_t *V, int n_rows, int n_cols,
-                         int k, math_t tol, cudaStream_t stream) {
-  return detail::evaluateSVDByL2Norm(handle, A_d, U, S_vec, V, n_rows, n_cols,
-                                     k, tol, stream);
+bool evaluateSVDByL2Norm(const raft::handle_t& handle,
+                         math_t* A_d,
+                         math_t* U,
+                         math_t* S_vec,
+                         math_t* V,
+                         int n_rows,
+                         int n_cols,
+                         int k,
+                         math_t tol,
+                         cudaStream_t stream)
+{
+  return detail::evaluateSVDByL2Norm(handle, A_d, U, S_vec, V, n_rows, n_cols, k, tol, stream);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/unary_op.hpp b/cpp/include/raft/linalg/unary_op.hpp
index 13795f9297..c54e3cc1c3 100644
--- a/cpp/include/raft/linalg/unary_op.hpp
+++ b/cpp/include/raft/linalg/unary_op.hpp
@@ -36,10 +36,13 @@ namespace linalg {
  * @note Lambda must be a functor with the following signature:
  *       `OutType func(const InType& val);`
  */
-template <typename InType, typename Lambda, typename IdxType = int,
-          typename OutType = InType, int TPB = 256>
-void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op,
-             cudaStream_t stream) {
+template <typename InType,
+          typename Lambda,
+          typename IdxType = int,
+          typename OutType = InType,
+          int TPB          = 256>
+void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream)
+{
   detail::unaryOpCaller(out, in, len, op, stream);
 }
 
@@ -60,10 +63,9 @@ void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op,
  *                    where outLocationOffset will be out + idx.
  * @param[in]  stream cuda stream where to launch work
  */
-template <typename OutType, typename Lambda, typename IdxType = int,
-          int TPB = 256>
-void writeOnlyUnaryOp(OutType *out, IdxType len, Lambda op,
-                      cudaStream_t stream) {
+template <typename OutType, typename Lambda, typename IdxType = int, int TPB = 256>
+void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream)
+{
   detail::writeOnlyUnaryOpCaller(out, len, op, stream);
 }
 
diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
index b91516279a..8fbd68f0a6 100644
--- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
@@ -24,13 +24,8 @@
 #include <raft/sparse/detail/utils.h>
 #include <raft/sparse/distance/common.h>
 #include <raft/cuda_utils.cuh>
-<<<<<<< HEAD
 #include <raft/linalg/unary_op.hpp>
-#include <raft/sparse/csr.cuh>
-=======
-#include <raft/linalg/unary_op.cuh>
 #include <raft/sparse/csr.hpp>
->>>>>>> upstream/branch-22.02
 #include <raft/sparse/distance/detail/ip_distance.cuh>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/include/raft/sparse/selection/detail/knn.cuh b/cpp/include/raft/sparse/selection/detail/knn.cuh
index efb8d0201d..6cd0e3154d 100644
--- a/cpp/include/raft/sparse/selection/detail/knn.cuh
+++ b/cpp/include/raft/sparse/selection/detail/knn.cuh
@@ -21,7 +21,7 @@
 #include <raft/cudart_utils.h>
 #include <raft/linalg/distance_type.h>
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/mr/device/buffer.hpp>
 
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
index 27a23034c5..e65c79b5bd 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
@@ -17,7 +17,7 @@
 #include <cub/cub.cuh>
 #include <faiss/gpu/utils/Select.cuh>
 #include <limits>
-#include <raft/linalg/norm.cuh>
+#include <raft/linalg/norm.hpp>
 // TODO: Need to hide the PairwiseDistance class impl and expose to public API
 #include <raft/distance/detail/distance.cuh>
 #include <raft/distance/detail/pairwise_distance_base.cuh>
diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu
index 8d19c61b63..ae95fac0b2 100644
--- a/cpp/test/linalg/eig_sel.cu
+++ b/cpp/test/linalg/eig_sel.cu
@@ -83,9 +83,15 @@ class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
     raft::update_device(eig_vectors_ref.data(), eig_vectors_ref_h, 12, stream);
     raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, 4, stream);
 
-    raft::linalg::eigSelDC(handle, cov_matrix.data(), params.n_row,
-                           params.n_col, 3, eig_vectors.data(), eig_vals.data(),
-                           EigVecMemUsage::OVERWRITE_INPUT, stream);
+    raft::linalg::eigSelDC(handle,
+                           cov_matrix.data(),
+                           params.n_row,
+                           params.n_col,
+                           3,
+                           eig_vectors.data(),
+                           eig_vals.data(),
+                           EigVecMemUsage::OVERWRITE_INPUT,
+                           stream);
     RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
   }
 

From db817f62a6e8cdc6b65bdee18829a210d071e8a4 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Tue, 14 Dec 2021 13:19:52 -0800
Subject: [PATCH 07/17] changing h extensions to hpp

---
 cpp/include/raft/distance/detail/distance.cuh               | 2 +-
 cpp/include/raft/distance/distance.hpp                      | 2 +-
 cpp/include/raft/handle.hpp                                 | 4 ++--
 cpp/include/raft/label/merge_labels.cuh                     | 2 +-
 .../raft/linalg/{cublas_wrappers.h => cublas_wrappers.hpp}  | 0
 .../linalg/{cusolver_wrappers.h => cusolver_wrappers.hpp}   | 0
 cpp/include/raft/linalg/detail/cholesky_r1_update.hpp       | 4 ++--
 cpp/include/raft/linalg/detail/eig.hpp                      | 2 +-
 cpp/include/raft/linalg/detail/gemm.hpp                     | 2 +-
 cpp/include/raft/linalg/detail/lanczos.hpp                  | 2 +-
 cpp/include/raft/linalg/detail/qr.cuh                       | 4 ++--
 cpp/include/raft/linalg/detail/svd.cuh                      | 6 +++---
 .../raft/linalg/{distance_type.h => distance_type.hpp}      | 0
 cpp/include/raft/linalg/{gemv.h => gemv.hpp}                | 2 +-
 cpp/include/raft/linalg/{init.h => init.hpp}                | 0
 cpp/include/raft/linalg/{transpose.h => transpose.hpp}      | 2 +-
 cpp/include/raft/matrix/matrix.hpp                          | 2 +-
 cpp/include/raft/sparse/distance/detail/bin_distance.cuh    | 2 +-
 cpp/include/raft/sparse/distance/detail/ip_distance.cuh     | 2 +-
 cpp/include/raft/sparse/distance/detail/l2_distance.cuh     | 2 +-
 cpp/include/raft/sparse/distance/detail/lp_distance.cuh     | 2 +-
 cpp/include/raft/sparse/distance/distance.hpp               | 2 +-
 cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh | 2 +-
 cpp/include/raft/sparse/selection/detail/knn.cuh            | 2 +-
 cpp/include/raft/sparse/selection/detail/knn_graph.cuh      | 2 +-
 cpp/include/raft/sparse/selection/knn.hpp                   | 2 +-
 cpp/include/raft/sparse/selection/knn_graph.hpp             | 2 +-
 cpp/include/raft/spatial/knn/ann_common.h                   | 2 +-
 cpp/include/raft/spatial/knn/ball_cover.hpp                 | 2 +-
 cpp/include/raft/spatial/knn/ball_cover_common.h            | 2 +-
 cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh | 2 +-
 cpp/include/raft/spatial/knn/detail/common_faiss.h          | 2 +-
 cpp/include/raft/spatial/knn/detail/haversine_distance.cuh  | 2 +-
 .../raft/spatial/knn/detail/knn_brute_force_faiss.cuh       | 2 +-
 cpp/include/raft/spatial/knn/detail/processing.hpp          | 2 +-
 cpp/include/raft/spectral/kmeans.hpp                        | 2 +-
 cpp/include/raft/spectral/lapack.hpp                        | 4 ++--
 cpp/include/raft/spectral/matrix_wrappers.hpp               | 2 +-
 cpp/test/linalg/cholesky_r1.cu                              | 2 +-
 cpp/test/linalg/gemv.cu                                     | 2 +-
 cpp/test/linalg/reduce.cuh                                  | 2 +-
 cpp/test/linalg/transpose.cu                                | 2 +-
 cpp/test/sparse/connect_components.cu                       | 4 ++--
 cpp/test/sparse/dist_coo_spmv.cu                            | 2 +-
 cpp/test/sparse/distance.cu                                 | 2 +-
 cpp/test/sparse/knn.cu                                      | 2 +-
 cpp/test/sparse/linkage.cu                                  | 4 ++--
 cpp/test/spatial/ball_cover.cu                              | 2 +-
 cpp/test/spatial/fused_l2_knn.cu                            | 2 +-
 cpp/test/spatial/haversine.cu                               | 2 +-
 cpp/test/spatial/knn.cu                                     | 2 +-
 51 files changed, 55 insertions(+), 55 deletions(-)
 rename cpp/include/raft/linalg/{cublas_wrappers.h => cublas_wrappers.hpp} (100%)
 rename cpp/include/raft/linalg/{cusolver_wrappers.h => cusolver_wrappers.hpp} (100%)
 rename cpp/include/raft/linalg/{distance_type.h => distance_type.hpp} (100%)
 rename cpp/include/raft/linalg/{gemv.h => gemv.hpp} (99%)
 rename cpp/include/raft/linalg/{init.h => init.hpp} (100%)
 rename cpp/include/raft/linalg/{transpose.h => transpose.hpp} (98%)

diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh
index 9eeccdb827..a004d24ae8 100644
--- a/cpp/include/raft/distance/detail/distance.cuh
+++ b/cpp/include/raft/distance/detail/distance.cuh
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <cuda_runtime_api.h>
-#include <raft/linalg/distance_type.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/distance/detail/canberra.cuh>
 #include <raft/distance/detail/chebyshev.cuh>
@@ -31,6 +30,7 @@
 #include <raft/distance/detail/l1.cuh>
 #include <raft/distance/detail/minkowski.cuh>
 #include <raft/distance/detail/russell_rao.cuh>
+#include <raft/linalg/distance_type.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace raft {
diff --git a/cpp/include/raft/distance/distance.hpp b/cpp/include/raft/distance/distance.hpp
index 66832c12d2..745d1fea90 100644
--- a/cpp/include/raft/distance/distance.hpp
+++ b/cpp/include/raft/distance/distance.hpp
@@ -16,9 +16,9 @@
 
 #pragma once
 
-#include <raft/linalg/distance_type.h>
 #include <raft/distance/detail/distance.cuh>
 #include <raft/handle.hpp>
+#include <raft/linalg/distance_type.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace raft {
diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp
index bba7fabc54..fb45fa13a8 100644
--- a/cpp/include/raft/handle.hpp
+++ b/cpp/include/raft/handle.hpp
@@ -32,10 +32,10 @@
 ///@todo: enable once we have migrated cuml-comms layer too
 //#include <common/cuml_comms_int.hpp>
 
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/cusolver_wrappers.h>
 #include <raft/sparse/cusparse_wrappers.h>
 #include <raft/comms/comms.hpp>
+#include <raft/linalg/cublas_wrappers.hpp>
+#include <raft/linalg/cusolver_wrappers.hpp>
 #include <rmm/cuda_stream_pool.hpp>
 #include <rmm/exec_policy.hpp>
 #include "cudart_utils.h"
diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh
index 9818b5d71b..33413fafe5 100644
--- a/cpp/include/raft/label/merge_labels.cuh
+++ b/cpp/include/raft/label/merge_labels.cuh
@@ -20,8 +20,8 @@
 #include <limits>
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/init.h>
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/init.hpp>
 
 namespace raft {
 namespace label {
diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.hpp
similarity index 100%
rename from cpp/include/raft/linalg/cublas_wrappers.h
rename to cpp/include/raft/linalg/cublas_wrappers.hpp
diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.hpp
similarity index 100%
rename from cpp/include/raft/linalg/cusolver_wrappers.h
rename to cpp/include/raft/linalg/cusolver_wrappers.hpp
diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
index db00c5d6fc..641b38ff40 100644
--- a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
+++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
@@ -16,11 +16,11 @@
 
 #pragma once
 
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/cusolver_wrappers.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/handle.hpp>
 #include <raft/linalg/binary_op.hpp>
+#include <raft/linalg/cublas_wrappers.hpp>
+#include <raft/linalg/cusolver_wrappers.hpp>
 
 namespace raft {
 namespace linalg {
diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp
index 704fe339dc..6475ce969b 100644
--- a/cpp/include/raft/linalg/detail/eig.hpp
+++ b/cpp/include/raft/linalg/detail/eig.hpp
@@ -18,9 +18,9 @@
 
 #include <cuda_runtime_api.h>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cusolver_wrappers.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/handle.hpp>
+#include <raft/linalg/cusolver_wrappers.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp
index 8a74e78a79..0954097b80 100644
--- a/cpp/include/raft/linalg/detail/gemm.hpp
+++ b/cpp/include/raft/linalg/detail/gemm.hpp
@@ -17,9 +17,9 @@
 #pragma once
 
 #include <cublas_v2.h>
-#include <raft/linalg/cublas_wrappers.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/handle.hpp>
+#include <raft/linalg/cublas_wrappers.hpp>
 
 namespace raft {
 namespace linalg {
diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp
index 854b2333d6..f7052eef14 100644
--- a/cpp/include/raft/linalg/detail/lanczos.hpp
+++ b/cpp/include/raft/linalg/detail/lanczos.hpp
@@ -26,8 +26,8 @@
 #include <curand.h>
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cublas_wrappers.h>
 #include <raft/handle.hpp>
+#include <raft/linalg/cublas_wrappers.hpp>
 #include <raft/spectral/lapack.hpp>
 #include <raft/spectral/matrix_wrappers.hpp>
 #include <raft/spectral/warn_dbg.hpp>
diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh
index 5ca9850900..8dc46eeb9b 100644
--- a/cpp/include/raft/linalg/detail/qr.cuh
+++ b/cpp/include/raft/linalg/detail/qr.cuh
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/cusolver_wrappers.h>
+#include <raft/linalg/cublas_wrappers.hpp>
+#include <raft/linalg/cusolver_wrappers.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/include/raft/linalg/detail/svd.cuh b/cpp/include/raft/linalg/detail/svd.cuh
index 0d9cbc05dc..81bfa06f27 100644
--- a/cpp/include/raft/linalg/detail/svd.cuh
+++ b/cpp/include/raft/linalg/detail/svd.cuh
@@ -17,13 +17,13 @@
 #pragma once
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/cusolver_wrappers.h>
-#include <raft/linalg/transpose.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/handle.hpp>
+#include <raft/linalg/cublas_wrappers.hpp>
+#include <raft/linalg/cusolver_wrappers.hpp>
 #include <raft/linalg/eig.hpp>
 #include <raft/linalg/gemm.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <rmm/device_scalar.hpp>
diff --git a/cpp/include/raft/linalg/distance_type.h b/cpp/include/raft/linalg/distance_type.hpp
similarity index 100%
rename from cpp/include/raft/linalg/distance_type.h
rename to cpp/include/raft/linalg/distance_type.hpp
diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.hpp
similarity index 99%
rename from cpp/include/raft/linalg/gemv.h
rename to cpp/include/raft/linalg/gemv.hpp
index 965cd32a57..7dfd1f1db1 100644
--- a/cpp/include/raft/linalg/gemv.h
+++ b/cpp/include/raft/linalg/gemv.hpp
@@ -17,8 +17,8 @@
 #pragma once
 
 #include <cublas_v2.h>
-#include <raft/linalg/cublas_wrappers.h>
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/cublas_wrappers.hpp>
 
 #include <raft/handle.hpp>
 
diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.hpp
similarity index 100%
rename from cpp/include/raft/linalg/init.h
rename to cpp/include/raft/linalg/init.hpp
diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.hpp
similarity index 98%
rename from cpp/include/raft/linalg/transpose.h
rename to cpp/include/raft/linalg/transpose.hpp
index 63dbae1c8a..09e9e67e7b 100644
--- a/cpp/include/raft/linalg/transpose.h
+++ b/cpp/include/raft/linalg/transpose.hpp
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include <raft/linalg/cublas_wrappers.h>
 #include <raft/handle.hpp>
+#include <raft/linalg/cublas_wrappers.hpp>
 #include <rmm/exec_policy.hpp>
 
 namespace raft {
diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp
index a7a43cff6e..00651a9b62 100644
--- a/cpp/include/raft/matrix/matrix.hpp
+++ b/cpp/include/raft/matrix/matrix.hpp
@@ -21,10 +21,10 @@
 #include <cuda_runtime.h>
 #include <cusolverDn.h>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cublas_wrappers.h>
 #include <algorithm>
 #include <cstddef>
 #include <raft/handle.hpp>
+#include <raft/linalg/cublas_wrappers.hpp>
 
 namespace raft {
 namespace matrix {
diff --git a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
index ad97e0853a..141e5b3e5f 100644
--- a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
@@ -19,11 +19,11 @@
 #include <limits.h>
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/distance_type.h>
 #include <raft/sparse/cusparse_wrappers.h>
 #include <raft/sparse/detail/utils.h>
 #include <raft/sparse/distance/common.h>
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/distance_type.hpp>
 #include <raft/sparse/distance/detail/ip_distance.cuh>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
index 03c13df511..0f8b2d99bb 100644
--- a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
@@ -18,9 +18,9 @@
 
 #include <limits.h>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/distance_type.h>
 #include <raft/sparse/cusparse_wrappers.h>
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/distance_type.hpp>
 
 #include <raft/sparse/detail/utils.h>
 #include <raft/sparse/distance/common.h>
diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
index 8fbd68f0a6..62bfb7671e 100644
--- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
@@ -19,11 +19,11 @@
 #include <raft/spatial/knn/knn.hpp>
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/distance_type.h>
 #include <raft/sparse/cusparse_wrappers.h>
 #include <raft/sparse/detail/utils.h>
 #include <raft/sparse/distance/common.h>
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/distance_type.hpp>
 #include <raft/linalg/unary_op.hpp>
 #include <raft/sparse/csr.hpp>
 #include <raft/sparse/distance/detail/ip_distance.cuh>
diff --git a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
index 5be9de97c3..d062705b57 100644
--- a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
@@ -19,9 +19,9 @@
 #include <limits.h>
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/distance_type.h>
 #include <raft/sparse/cusparse_wrappers.h>
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/distance_type.hpp>
 
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp
index 9b708f4b27..c49730bdb9 100644
--- a/cpp/include/raft/sparse/distance/distance.hpp
+++ b/cpp/include/raft/sparse/distance/distance.hpp
@@ -19,9 +19,9 @@
 #include <raft/cudart_utils.h>
 #include <unordered_set>
 
-#include <raft/linalg/distance_type.h>
 #include <raft/sparse/cusparse_wrappers.h>
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/distance_type.hpp>
 #include <raft/mr/device/buffer.hpp>
 
 #include <raft/sparse/detail/utils.h>
diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
index fdd03a5faa..0c47b22201 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
@@ -23,8 +23,8 @@
 #include <raft/linalg/unary_op.hpp>
 #include <rmm/device_uvector.hpp>
 
-#include <raft/linalg/distance_type.h>
 #include <raft/sparse/hierarchy/common.h>
+#include <raft/linalg/distance_type.hpp>
 #include <raft/mr/device/buffer.hpp>
 #include <raft/sparse/convert/csr.hpp>
 #include <raft/sparse/coo.hpp>
diff --git a/cpp/include/raft/sparse/selection/detail/knn.cuh b/cpp/include/raft/sparse/selection/detail/knn.cuh
index 6cd0e3154d..21a40cf626 100644
--- a/cpp/include/raft/sparse/selection/detail/knn.cuh
+++ b/cpp/include/raft/sparse/selection/detail/knn.cuh
@@ -19,8 +19,8 @@
 #include <rmm/device_uvector.hpp>
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/distance_type.h>
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/distance_type.hpp>
 #include <raft/linalg/unary_op.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/mr/device/buffer.hpp>
diff --git a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh b/cpp/include/raft/sparse/selection/detail/knn_graph.cuh
index 83cb23f513..c96fefdc5d 100644
--- a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh
+++ b/cpp/include/raft/sparse/selection/detail/knn_graph.cuh
@@ -24,7 +24,7 @@
 
 #include <raft/spatial/knn/knn.hpp>
 
-#include <raft/linalg/distance_type.h>
+#include <raft/linalg/distance_type.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <thrust/device_ptr.h>
diff --git a/cpp/include/raft/sparse/selection/knn.hpp b/cpp/include/raft/sparse/selection/knn.hpp
index 141026dc82..bfc0c14a8c 100644
--- a/cpp/include/raft/sparse/selection/knn.hpp
+++ b/cpp/include/raft/sparse/selection/knn.hpp
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include <raft/linalg/distance_type.h>
 #include <raft/handle.hpp>
+#include <raft/linalg/distance_type.hpp>
 #include <raft/sparse/selection/detail/knn.cuh>
 
 namespace raft {
diff --git a/cpp/include/raft/sparse/selection/knn_graph.hpp b/cpp/include/raft/sparse/selection/knn_graph.hpp
index 7af452541f..2a3159900c 100644
--- a/cpp/include/raft/sparse/selection/knn_graph.hpp
+++ b/cpp/include/raft/sparse/selection/knn_graph.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/linalg/distance_type.h>
+#include <raft/linalg/distance_type.hpp>
 #include <raft/sparse/coo.hpp>
 #include <raft/sparse/selection/detail/knn_graph.cuh>
 
diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index 573a23181d..e2df51a62b 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/linalg/distance_type.h>
+#include <raft/linalg/distance_type.hpp>
 
 #include <faiss/gpu/GpuIndex.h>
 #include <faiss/gpu/StandardGpuResources.h>
diff --git a/cpp/include/raft/spatial/knn/ball_cover.hpp b/cpp/include/raft/spatial/knn/ball_cover.hpp
index cb2b9e99cd..4495221a34 100644
--- a/cpp/include/raft/spatial/knn/ball_cover.hpp
+++ b/cpp/include/raft/spatial/knn/ball_cover.hpp
@@ -18,8 +18,8 @@
 
 #include <cstdint>
 
-#include <raft/linalg/distance_type.h>
 #include <thrust/transform.h>
+#include <raft/linalg/distance_type.hpp>
 #include "ball_cover_common.h"
 #include "detail/ball_cover.cuh"
 #include "detail/ball_cover/common.cuh"
diff --git a/cpp/include/raft/spatial/knn/ball_cover_common.h b/cpp/include/raft/spatial/knn/ball_cover_common.h
index e38124edb6..9ed1d2f726 100644
--- a/cpp/include/raft/spatial/knn/ball_cover_common.h
+++ b/cpp/include/raft/spatial/knn/ball_cover_common.h
@@ -16,9 +16,9 @@
 
 #pragma once
 
-#include <raft/linalg/distance_type.h>
 #include <cstdint>
 #include <raft/handle.hpp>
+#include <raft/linalg/distance_type.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace raft {
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
index b7f124c51e..6f223fdb43 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
@@ -43,7 +43,7 @@
 
 #include <thrust/iterator/transform_iterator.h>
 
-#include <raft/linalg/distance_type.h>
+#include <raft/linalg/distance_type.hpp>
 
 #include <cuml/neighbors/knn.hpp>
 
diff --git a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h
index 5618186dfc..3708523b4f 100644
--- a/cpp/include/raft/spatial/knn/detail/common_faiss.h
+++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h
@@ -20,7 +20,7 @@
 #include <raft/cuda_utils.cuh>
 
 #include <faiss/gpu/GpuDistance.h>
-#include <raft/linalg/distance_type.h>
+#include <raft/linalg/distance_type.hpp>
 
 namespace raft {
 namespace spatial {
diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
index 049c11514c..50340a284b 100644
--- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
+++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
@@ -26,8 +26,8 @@
 #include <faiss/gpu/utils/Limits.cuh>
 #include <faiss/gpu/utils/Select.cuh>
 
-#include <raft/linalg/distance_type.h>
 #include <raft/handle.hpp>
+#include <raft/linalg/distance_type.hpp>
 
 namespace raft {
 namespace spatial {
diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index 12b7124773..54509b4a51 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -29,11 +29,11 @@
 #include <faiss/gpu/utils/Limits.cuh>
 #include <faiss/gpu/utils/Select.cuh>
 
-#include <raft/linalg/distance_type.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <cstdint>
 #include <iostream>
 #include <raft/handle.hpp>
+#include <raft/linalg/distance_type.hpp>
 #include <set>
 
 #include "fused_l2_knn.cuh"
diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp
index 905e797841..5a4672e711 100644
--- a/cpp/include/raft/spatial/knn/detail/processing.hpp
+++ b/cpp/include/raft/spatial/knn/detail/processing.hpp
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#include <raft/linalg/distance_type.h>
+#include <raft/linalg/distance_type.hpp>
 #include <raft/linalg/matrix_vector_op.hpp>
 #include <raft/linalg/norm.hpp>
 #include <raft/linalg/unary_op.hpp>
diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp
index 549dd4917c..cbd0486086 100644
--- a/cpp/include/raft/spectral/kmeans.hpp
+++ b/cpp/include/raft/spectral/kmeans.hpp
@@ -29,9 +29,9 @@
 #include <thrust/sort.h>
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cublas_wrappers.h>
 #include <raft/device_atomics.cuh>
 #include <raft/handle.hpp>
+#include <raft/linalg/cublas_wrappers.hpp>
 #include <raft/spectral/matrix_wrappers.hpp>
 #include <raft/spectral/warn_dbg.hpp>
 
diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp
index 35fc22c770..a47c41564c 100644
--- a/cpp/include/raft/spectral/lapack.hpp
+++ b/cpp/include/raft/spectral/lapack.hpp
@@ -17,9 +17,9 @@
 #pragma once
 #include <cusolverDn.h>
 
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/cusolver_wrappers.h>
 #include <raft/error.hpp>
+#include <raft/linalg/cublas_wrappers.hpp>
+#include <raft/linalg/cusolver_wrappers.hpp>
 
 // for now; TODO: check if/where this `define` should be;
 //
diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp
index 0d79904707..6f9d383c63 100644
--- a/cpp/include/raft/spectral/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/matrix_wrappers.hpp
@@ -16,9 +16,9 @@
 #pragma once
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cublas_wrappers.h>
 #include <raft/sparse/cusparse_wrappers.h>
 #include <raft/handle.hpp>
+#include <raft/linalg/cublas_wrappers.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <thrust/fill.h>
diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu
index b93b0b90e9..85f5d7ada1 100644
--- a/cpp/test/linalg/cholesky_r1.cu
+++ b/cpp/test/linalg/cholesky_r1.cu
@@ -16,9 +16,9 @@
 
 #include <gtest/gtest.h>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cusolver_wrappers.h>
 #include <raft/handle.hpp>
 #include <raft/linalg/cholesky_r1_update.hpp>
+#include <raft/linalg/cusolver_wrappers.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/test/linalg/gemv.cu b/cpp/test/linalg/gemv.cu
index 962b17fa24..580effbe50 100644
--- a/cpp/test/linalg/gemv.cu
+++ b/cpp/test/linalg/gemv.cu
@@ -15,8 +15,8 @@
  */
 
 #include <gtest/gtest.h>
-#include <raft/linalg/gemv.h>
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/gemv.hpp>
 #include <raft/random/rng.hpp>
 #include "../test_utils.h"
 
diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh
index dfef1cf054..aae57e136e 100644
--- a/cpp/test/linalg/reduce.cuh
+++ b/cpp/test/linalg/reduce.cuh
@@ -17,8 +17,8 @@
 #pragma once
 
 #include <cublas_v2.h>
-#include <raft/linalg/cublas_wrappers.h>
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/cublas_wrappers.hpp>
 #include <raft/linalg/unary_op.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu
index a63b08e970..01962fcd23 100644
--- a/cpp/test/linalg/transpose.cu
+++ b/cpp/test/linalg/transpose.cu
@@ -16,8 +16,8 @@
 
 #include <gtest/gtest.h>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/transpose.h>
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/transpose.hpp>
 #include <raft/random/rng.hpp>
 #include "../test_utils.h"
 
diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu
index df138e2bdb..2c56a902d4 100644
--- a/cpp/test/sparse/connect_components.cu
+++ b/cpp/test/sparse/connect_components.cu
@@ -26,8 +26,8 @@
 #include <raft/sparse/mst/mst.cuh>
 #include <raft/sparse/selection/knn_graph.hpp>
 
-#include <raft/linalg/distance_type.h>
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/distance_type.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <raft/sparse/convert/csr.hpp>
 #include <raft/sparse/coo.hpp>
 #include <raft/sparse/hierarchy/single_linkage.hpp>
diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu
index 9701ec3259..eae8fec500 100644
--- a/cpp/test/sparse/dist_coo_spmv.cu
+++ b/cpp/test/sparse/dist_coo_spmv.cu
@@ -19,8 +19,8 @@
 #include <cusparse_v2.h>
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/distance_type.h>
 #include <raft/sparse/cusparse_wrappers.h>
+#include <raft/linalg/distance_type.hpp>
 #include <raft/linalg/unary_op.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu
index f4f346561c..d635c4a813 100644
--- a/cpp/test/sparse/distance.cu
+++ b/cpp/test/sparse/distance.cu
@@ -19,8 +19,8 @@
 #include <cusparse_v2.h>
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/distance_type.h>
 #include <raft/sparse/cusparse_wrappers.h>
+#include <raft/linalg/distance_type.hpp>
 
 #include <raft/sparse/distance/distance.hpp>
 
diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu
index 389e8c4b9c..6e4de63e4d 100644
--- a/cpp/test/sparse/knn.cu
+++ b/cpp/test/sparse/knn.cu
@@ -17,8 +17,8 @@
 #include <cusparse_v2.h>
 #include <gtest/gtest.h>
 
-#include <raft/linalg/distance_type.h>
 #include <raft/sparse/cusparse_wrappers.h>
+#include <raft/linalg/distance_type.hpp>
 #include <raft/sparse/selection/knn.hpp>
 #include "../test_utils.h"
 
diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu
index 81e6dc4768..51947167cf 100644
--- a/cpp/test/sparse/linkage.cu
+++ b/cpp/test/sparse/linkage.cu
@@ -17,8 +17,8 @@
 #include "../test_utils.h"
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/distance_type.h>
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/distance_type.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <raft/sparse/coo.hpp>
 #include <raft/sparse/hierarchy/single_linkage.hpp>
 
diff --git a/cpp/test/spatial/ball_cover.cu b/cpp/test/spatial/ball_cover.cu
index 73c0f87fdd..d63674c13c 100644
--- a/cpp/test/spatial/ball_cover.cu
+++ b/cpp/test/spatial/ball_cover.cu
@@ -15,7 +15,7 @@
  */
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/distance_type.h>
+#include <raft/linalg/distance_type.hpp>
 #include <raft/spatial/knn/ball_cover.hpp>
 #include <raft/spatial/knn/detail/knn_brute_force_faiss.cuh>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu
index 078d5e0eec..303844b0a4 100644
--- a/cpp/test/spatial/fused_l2_knn.cu
+++ b/cpp/test/spatial/fused_l2_knn.cu
@@ -19,8 +19,8 @@
 #include <faiss/gpu/GpuDistance.h>
 #include <faiss/gpu/StandardGpuResources.h>
 
-#include <raft/linalg/distance_type.h>
 #include <raft/spatial/knn/detail/common_faiss.h>
+#include <raft/linalg/distance_type.hpp>
 #include <raft/random/rng.hpp>
 #include <raft/spatial/knn/detail/fused_l2_knn.cuh>
 #include <raft/spatial/knn/knn.hpp>
diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu
index 171b698265..d28fd55dbe 100644
--- a/cpp/test/spatial/haversine.cu
+++ b/cpp/test/spatial/haversine.cu
@@ -15,8 +15,8 @@
  */
 
 #include <gtest/gtest.h>
-#include <raft/linalg/distance_type.h>
 #include <iostream>
+#include <raft/linalg/distance_type.hpp>
 #include <raft/spatial/knn/detail/haversine_distance.cuh>
 #include <rmm/device_uvector.hpp>
 #include <vector>
diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu
index 2fb9bd2ca5..839d60095e 100644
--- a/cpp/test/spatial/knn.cu
+++ b/cpp/test/spatial/knn.cu
@@ -16,7 +16,7 @@
 
 #include "../test_utils.h"
 
-#include <raft/linalg/distance_type.h>
+#include <raft/linalg/distance_type.hpp>
 #include <raft/spatial/knn/knn.hpp>
 
 #include <rmm/device_buffer.hpp>

From abec4d24f444cc1541c5e8d15210fee22d64bc58 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Tue, 21 Dec 2021 18:34:59 -0800
Subject: [PATCH 08/17] cublas/cusolver only in detail, wrap up rest of linalg

---
 cpp/include/raft/distance/detail/distance.cuh |   2 +-
 cpp/include/raft/distance/distance.hpp        |   2 +-
 .../{linalg => distance}/distance_type.hpp    |   2 +-
 cpp/include/raft/handle.hpp                   |   4 +-
 cpp/include/raft/linalg/add.hpp               |  10 +-
 cpp/include/raft/linalg/binary_op.hpp         |   2 +-
 .../raft/linalg/cholesky_r1_update.hpp        |   2 +-
 .../raft/linalg/coalesced_reduction.hpp       |   2 +-
 cpp/include/raft/linalg/detail/add.cuh        |  16 +++
 .../raft/linalg/detail/cholesky_r1_update.hpp |  39 +++--
 .../linalg/{ => detail}/cublas_wrappers.hpp   |  41 +-----
 .../linalg/{ => detail}/cusolver_wrappers.hpp |   6 +-
 cpp/include/raft/linalg/detail/divide.hpp     |  34 +++++
 cpp/include/raft/linalg/detail/eig.hpp        |   6 +-
 cpp/include/raft/linalg/detail/eltwise.hpp    |  77 ++++++++++
 cpp/include/raft/linalg/detail/gemm.hpp       |  21 ++-
 cpp/include/raft/linalg/detail/gemv.hpp       | 117 +++++++++++++++
 cpp/include/raft/linalg/detail/init.hpp       |  54 +++++++
 cpp/include/raft/linalg/detail/lanczos.hpp    |   4 +-
 .../raft/linalg/detail/mean_squared_error.hpp |  38 +++++
 cpp/include/raft/linalg/detail/multiply.hpp   |  34 +++++
 cpp/include/raft/linalg/detail/norm.hpp       | 116 +++++++++++++++
 cpp/include/raft/linalg/detail/qr.cuh         |  13 +-
 cpp/include/raft/linalg/detail/reduce.hpp     |  63 ++++++++
 cpp/include/raft/linalg/detail/subtract.cuh   |  14 ++
 cpp/include/raft/linalg/detail/svd.cuh        | 134 ++++++++++--------
 cpp/include/raft/linalg/detail/transpose.hpp  |  81 +++++++++++
 cpp/include/raft/linalg/divide.hpp            |   7 +-
 cpp/include/raft/linalg/eig.hpp               |   6 +-
 cpp/include/raft/linalg/eltwise.hpp           |  21 ++-
 cpp/include/raft/linalg/gemm.hpp              |  23 ++-
 cpp/include/raft/linalg/gemv.hpp              |  33 +----
 cpp/include/raft/linalg/init.hpp              |  18 +--
 cpp/include/raft/linalg/lanczos.hpp           |   2 +-
 cpp/include/raft/linalg/map.hpp               |   2 +-
 cpp/include/raft/linalg/map_then_reduce.hpp   |   2 +-
 cpp/include/raft/linalg/matrix_vector_op.hpp  |   2 +-
 .../raft/linalg/mean_squared_error.hpp        |  10 +-
 cpp/include/raft/linalg/multiply.hpp          |   7 +-
 cpp/include/raft/linalg/norm.hpp              |  72 +---------
 cpp/include/raft/linalg/qr.hpp                |   2 +-
 cpp/include/raft/linalg/reduce.hpp            |  17 +--
 cpp/include/raft/linalg/strided_reduction.hpp |   2 +-
 cpp/include/raft/linalg/subtract.hpp          |   8 +-
 cpp/include/raft/linalg/svd.hpp               |  21 +--
 cpp/include/raft/linalg/transpose.hpp         |  45 +-----
 cpp/include/raft/linalg/unary_op.hpp          |   2 +-
 cpp/include/raft/matrix/matrix.hpp            |   5 +-
 .../sparse/distance/detail/bin_distance.cuh   |   2 +-
 .../sparse/distance/detail/ip_distance.cuh    |   2 +-
 .../sparse/distance/detail/l2_distance.cuh    |   2 +-
 .../sparse/distance/detail/lp_distance.cuh    |   2 +-
 cpp/include/raft/sparse/distance/distance.hpp |   2 +-
 .../hierarchy/detail/connectivities.cuh       |   2 +-
 .../raft/sparse/selection/detail/knn.cuh      |   2 +-
 .../sparse/selection/detail/knn_graph.cuh     |   2 +-
 cpp/include/raft/sparse/selection/knn.hpp     |   2 +-
 .../raft/sparse/selection/knn_graph.hpp       |   2 +-
 cpp/include/raft/spatial/knn/ann_common.h     |   2 +-
 cpp/include/raft/spatial/knn/ball_cover.hpp   |   2 +-
 .../raft/spatial/knn/ball_cover_common.h      |   2 +-
 .../knn/detail/ann_quantized_faiss.cuh        |   2 +-
 .../raft/spatial/knn/detail/common_faiss.h    |   2 +-
 .../spatial/knn/detail/haversine_distance.cuh |   2 +-
 .../knn/detail/knn_brute_force_faiss.cuh      |   2 +-
 .../raft/spatial/knn/detail/processing.hpp    |   2 +-
 cpp/include/raft/spectral/kmeans.hpp          |  35 ++---
 cpp/include/raft/spectral/lapack.hpp          |   4 +-
 cpp/include/raft/spectral/matrix_wrappers.hpp |  13 +-
 .../raft/spectral/modularity_maximization.hpp |   4 +-
 cpp/include/raft/spectral/spectral_util.hpp   |  48 ++++---
 cpp/test/linalg/cholesky_r1.cu                |  24 ++--
 cpp/test/linalg/reduce.cuh                    |   5 +-
 cpp/test/sparse/connect_components.cu         |   2 +-
 cpp/test/sparse/dist_coo_spmv.cu              |   2 +-
 cpp/test/sparse/distance.cu                   |   2 +-
 cpp/test/sparse/knn.cu                        |   2 +-
 cpp/test/sparse/linkage.cu                    |   2 +-
 cpp/test/spatial/ball_cover.cu                |   2 +-
 cpp/test/spatial/fused_l2_knn.cu              |   2 +-
 cpp/test/spatial/haversine.cu                 |   2 +-
 cpp/test/spatial/knn.cu                       |   2 +-
 82 files changed, 967 insertions(+), 461 deletions(-)
 rename cpp/include/raft/{linalg => distance}/distance_type.hpp (97%)
 rename cpp/include/raft/linalg/{ => detail}/cublas_wrappers.hpp (95%)
 rename cpp/include/raft/linalg/{ => detail}/cusolver_wrappers.hpp (99%)
 create mode 100644 cpp/include/raft/linalg/detail/divide.hpp
 create mode 100644 cpp/include/raft/linalg/detail/eltwise.hpp
 create mode 100644 cpp/include/raft/linalg/detail/gemv.hpp
 create mode 100644 cpp/include/raft/linalg/detail/init.hpp
 create mode 100644 cpp/include/raft/linalg/detail/mean_squared_error.hpp
 create mode 100644 cpp/include/raft/linalg/detail/multiply.hpp
 create mode 100644 cpp/include/raft/linalg/detail/norm.hpp
 create mode 100644 cpp/include/raft/linalg/detail/reduce.hpp
 create mode 100644 cpp/include/raft/linalg/detail/transpose.hpp

diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh
index a004d24ae8..45850de115 100644
--- a/cpp/include/raft/distance/detail/distance.cuh
+++ b/cpp/include/raft/distance/detail/distance.cuh
@@ -30,7 +30,7 @@
 #include <raft/distance/detail/l1.cuh>
 #include <raft/distance/detail/minkowski.cuh>
 #include <raft/distance/detail/russell_rao.cuh>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace raft {
diff --git a/cpp/include/raft/distance/distance.hpp b/cpp/include/raft/distance/distance.hpp
index 745d1fea90..935cf6677a 100644
--- a/cpp/include/raft/distance/distance.hpp
+++ b/cpp/include/raft/distance/distance.hpp
@@ -17,8 +17,8 @@
 #pragma once
 
 #include <raft/distance/detail/distance.cuh>
+#include <raft/distance/distance_type.hpp>
 #include <raft/handle.hpp>
-#include <raft/linalg/distance_type.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace raft {
diff --git a/cpp/include/raft/linalg/distance_type.hpp b/cpp/include/raft/distance/distance_type.hpp
similarity index 97%
rename from cpp/include/raft/linalg/distance_type.hpp
rename to cpp/include/raft/distance/distance_type.hpp
index 681a83f3f8..7a15c97f48 100644
--- a/cpp/include/raft/linalg/distance_type.hpp
+++ b/cpp/include/raft/distance/distance_type.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp
index fb45fa13a8..d1b0e35260 100644
--- a/cpp/include/raft/handle.hpp
+++ b/cpp/include/raft/handle.hpp
@@ -34,8 +34,8 @@
 
 #include <raft/sparse/cusparse_wrappers.h>
 #include <raft/comms/comms.hpp>
-#include <raft/linalg/cublas_wrappers.hpp>
-#include <raft/linalg/cusolver_wrappers.hpp>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/detail/cusolver_wrappers.hpp>
 #include <rmm/cuda_stream_pool.hpp>
 #include <rmm/exec_policy.hpp>
 #include "cudart_utils.h"
diff --git a/cpp/include/raft/linalg/add.hpp b/cpp/include/raft/linalg/add.hpp
index 2a59339c20..08496eef0d 100644
--- a/cpp/include/raft/linalg/add.hpp
+++ b/cpp/include/raft/linalg/add.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,10 +17,6 @@
 #pragma once
 
 #include "detail/add.cuh"
-#include "detail/functional.cuh"
-
-#include "binary_op.hpp"
-#include "unary_op.hpp"
 
 namespace raft {
 namespace linalg {
@@ -44,7 +40,7 @@ using detail::adds_scalar;
 template <typename InT, typename OutT = InT, typename IdxType = int>
 void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
 {
-  unaryOp(out, in, len, adds_scalar<InT, OutT>(scalar), stream);
+  detail::addScalar(out, in, scalar, len, stream);
 }
 
 /**
@@ -63,7 +59,7 @@ void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t s
 template <typename InT, typename OutT = InT, typename IdxType = int>
 void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
 {
-  binaryOp(out, in1, in2, len, thrust::plus<InT>(), stream);
+  detail::add(out, in1, in2, len, stream);
 }
 
 /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and
diff --git a/cpp/include/raft/linalg/binary_op.hpp b/cpp/include/raft/linalg/binary_op.hpp
index e482240b59..12afcbcd9a 100644
--- a/cpp/include/raft/linalg/binary_op.hpp
+++ b/cpp/include/raft/linalg/binary_op.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/cholesky_r1_update.hpp b/cpp/include/raft/linalg/cholesky_r1_update.hpp
index 2428972d85..9dc9630a86 100644
--- a/cpp/include/raft/linalg/cholesky_r1_update.hpp
+++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp
index a8f19f61b1..00ac7b4be9 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.hpp
+++ b/cpp/include/raft/linalg/coalesced_reduction.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh
index 8459f7924d..7924f11e90 100644
--- a/cpp/include/raft/linalg/detail/add.cuh
+++ b/cpp/include/raft/linalg/detail/add.cuh
@@ -16,12 +16,28 @@
 
 #pragma once
 
+#include "functional.cuh"
+
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/binary_op.hpp>
+#include <raft/linalg/unary_op.hpp>
 
 namespace raft {
 namespace linalg {
 namespace detail {
 
+template <typename InT, typename OutT = InT, typename IdxType = int>
+void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
+{
+  raft::linalg::unaryOp(out, in, len, adds_scalar<InT, OutT>(scalar), stream);
+}
+
+template <typename InT, typename OutT = InT, typename IdxType = int>
+void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
+{
+  raft::linalg::binaryOp(out, in1, in2, len, thrust::plus<InT>(), stream);
+}
+
 template <class math_t, typename IdxType>
 __global__ void add_dev_scalar_kernel(math_t* outDev,
                                       const math_t* inDev,
diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
index 641b38ff40..45f76660e8 100644
--- a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
+++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
@@ -19,8 +19,8 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/handle.hpp>
 #include <raft/linalg/binary_op.hpp>
-#include <raft/linalg/cublas_wrappers.hpp>
-#include <raft/linalg/cusolver_wrappers.hpp>
+#include "cublas_wrappers.hpp"
+#include "cusolver_wrappers.hpp"
 
 namespace raft {
 namespace linalg {
@@ -76,35 +76,32 @@ void choleskyRank1Update(const raft::handle_t& handle,
     // contiguous. We copy elements from A_row to a contiguous workspace A_new.
     A_row = L + n - 1;
     A_new = reinterpret_cast<math_t*>(workspace);
-    RAFT_CUBLAS_TRY(
-      raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream));
+    RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream));
   }
   cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N;
   if (n > 1) {
     // Calculate L_12 = x by solving equation L_11 x = A_12
     math_t alpha = 1;
-    RAFT_CUBLAS_TRY(raft::linalg::cublastrsm(handle.get_cublas_handle(),
-                                             CUBLAS_SIDE_LEFT,
-                                             uplo,
-                                             op,
-                                             CUBLAS_DIAG_NON_UNIT,
-                                             n - 1,
-                                             1,
-                                             &alpha,
-                                             L,
-                                             ld,
-                                             A_new,
-                                             n - 1,
-                                             stream));
+    RAFT_CUBLAS_TRY(cublastrsm(handle.get_cublas_handle(),
+                               CUBLAS_SIDE_LEFT,
+                               uplo,
+                               op,
+                               CUBLAS_DIAG_NON_UNIT,
+                               n - 1,
+                               1,
+                               &alpha,
+                               L,
+                               ld,
+                               A_new,
+                               n - 1,
+                               stream));
 
     // A_new now stores L_12, we calculate s = L_12 * L_12
-    RAFT_CUBLAS_TRY(
-      raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream));
+    RAFT_CUBLAS_TRY(cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream));
 
     if (uplo == CUBLAS_FILL_MODE_LOWER) {
       // Copy back the L_12 elements as the n-th row of L
-      RAFT_CUBLAS_TRY(
-        raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream));
+      RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream));
     }
   } else {  // n == 1 case
     RAFT_CUDA_TRY(cudaMemsetAsync(s, 0, sizeof(math_t), stream));
diff --git a/cpp/include/raft/linalg/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
similarity index 95%
rename from cpp/include/raft/linalg/cublas_wrappers.hpp
rename to cpp/include/raft/linalg/detail/cublas_wrappers.hpp
index 024ed4a0e2..5c8779b0cf 100644
--- a/cpp/include/raft/linalg/cublas_wrappers.hpp
+++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -117,6 +117,7 @@ inline const char* cublas_error_to_string(cublasStatus_t err)
 
 namespace raft {
 namespace linalg {
+namespace detail {
 
 /**
  * @defgroup Axpy cublas ax+y operations
@@ -142,7 +143,6 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle,
                                  int incy,
                                  cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSaxpy(handle, n, alpha, x, incx, y, incy);
 }
 
@@ -156,7 +156,6 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle,
                                  int incy,
                                  cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDaxpy(handle, n, alpha, x, incx, y, incy);
 }
 /** @} */
@@ -173,7 +172,6 @@ template <>
 inline cublasStatus_t cublasSwap(
   cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSswap(handle, n, x, incx, y, incy);
 }
 
@@ -181,7 +179,6 @@ template <>
 inline cublasStatus_t cublasSwap(
   cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDswap(handle, n, x, incx, y, incy);
 }
 
@@ -199,14 +196,12 @@ template <>
 inline cublasStatus_t cublasCopy(
   cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy, cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasScopy(handle, n, x, incx, y, incy);
 }
 template <>
 inline cublasStatus_t cublasCopy(
   cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy, cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDcopy(handle, n, x, incx, y, incy);
 }
 /** @} */
@@ -245,7 +240,6 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle,
                                  int incy,
                                  cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy);
 }
 
@@ -264,7 +258,6 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle,
                                  int incy,
                                  cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy);
 }
 /** @} */
@@ -298,7 +291,6 @@ inline cublasStatus_t cublasger(cublasHandle_t handle,
                                 int lda,
                                 cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda);
 }
 
@@ -315,7 +307,6 @@ inline cublasStatus_t cublasger(cublasHandle_t handle,
                                 int lda,
                                 cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda);
 }
 /** @} */
@@ -358,7 +349,6 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc);
 }
 
@@ -379,7 +369,6 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc);
 }
 /** @} */
@@ -425,7 +414,6 @@ inline cublasStatus_t cublasgemmBatched(  // NOLINT
   int batchCount,
   cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSgemmBatched(handle,
                             transa,
                             transb,
@@ -462,7 +450,6 @@ inline cublasStatus_t cublasgemmBatched(  // NOLINT
   int batchCount,
   cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDgemmBatched(handle,
                             transa,
                             transb,
@@ -529,7 +516,6 @@ inline cublasStatus_t cublasgemmStridedBatched(  // NOLINT
   int batchCount,
   cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSgemmStridedBatched(handle,
                                    transa,
                                    transb,
@@ -572,7 +558,6 @@ inline cublasStatus_t cublasgemmStridedBatched(  // NOLINT
   int batchCount,
   cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDgemmStridedBatched(handle,
                                    transa,
                                    transb,
@@ -619,7 +604,6 @@ inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,  // NOLINT
                                          int batchSize,
                                          cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize);
 }
 
@@ -633,7 +617,6 @@ inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,  // NOLINT
                                          int batchSize,
                                          cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize);
 }
 
@@ -662,7 +645,6 @@ inline cublasStatus_t cublasgetriBatched(  // NOLINT
   int batchSize,
   cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize);
 }
 
@@ -679,7 +661,6 @@ inline cublasStatus_t cublasgetriBatched(  // NOLINT
   int batchSize,
   cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize);
 }
 
@@ -720,7 +701,6 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
                                         int batchSize,
                                         cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSgelsBatched(
     handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
 }
@@ -740,7 +720,6 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
                                         int batchSize,
                                         cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDgelsBatched(
     handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
 }
@@ -783,7 +762,6 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc);
 }
 
@@ -803,7 +781,6 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc);
 }
 /** @} */
@@ -844,7 +821,6 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
 }
 
@@ -864,7 +840,6 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
 }
 /** @} */
@@ -901,7 +876,6 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
 }
 
@@ -919,7 +893,6 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
 }
 /** @} */
@@ -936,7 +909,6 @@ template <>
 inline cublasStatus_t cublasnrm2(
   cublasHandle_t handle, int n, const float* x, int incx, float* result, cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSnrm2(handle, n, x, incx, result);
 }
 
@@ -944,7 +916,6 @@ template <>
 inline cublasStatus_t cublasnrm2(
   cublasHandle_t handle, int n, const double* x, int incx, double* result, cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDnrm2(handle, n, x, incx, result);
 }
 /** @} */
@@ -979,7 +950,6 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle,
                                  int ldb,
                                  cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
 }
 
@@ -998,7 +968,6 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle,
                                  int ldb,
                                  cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
 }
 
@@ -1026,7 +995,6 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle,
                                 float* result,
                                 cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSdot(handle, n, x, incx, y, incy, result);
 }
 
@@ -1040,7 +1008,6 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle,
                                 double* result,
                                 cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDdot(handle, n, x, incx, y, incy, result);
 }
 /** @} */
@@ -1061,7 +1028,6 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle,
                                            cublasPointerMode_t mode,
                                            cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSetPointerMode(handle, mode);
 }
 /** @} */
@@ -1078,7 +1044,6 @@ template <>
 inline cublasStatus_t cublasscal(
   cublasHandle_t handle, int n, const float* alpha, float* x, int incx, cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSscal(handle, n, alpha, x, incx);
 }
 
@@ -1086,11 +1051,11 @@ template <>
 inline cublasStatus_t cublasscal(
   cublasHandle_t handle, int n, const double* alpha, double* x, int incx, cudaStream_t stream)
 {
-  CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDscal(handle, n, alpha, x, incx);
 }
 
 /** @} */
 
+}  // namespace detail
 }  // namespace linalg
 }  // namespace raft
diff --git a/cpp/include/raft/linalg/cusolver_wrappers.hpp b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
similarity index 99%
rename from cpp/include/raft/linalg/cusolver_wrappers.hpp
rename to cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
index 988e7512d5..2ff6825ea9 100644
--- a/cpp/include/raft/linalg/cusolver_wrappers.hpp
+++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -115,6 +115,7 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err)
 
 namespace raft {
 namespace linalg {
+namespace detail {
 
 /**
  * @defgroup Getrf cusolver getrf operations
@@ -441,7 +442,6 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
 }
 /** @} */
 
-#if CUDART_VERSION >= 10010
 /**
  * @defgroup syevdx cusolver syevdx operations
  * @{
@@ -575,7 +575,6 @@ inline cusolverStatus_t cusolverDnsyevdx(  // NOLINT
     handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo);
 }
 /** @} */
-#endif
 
 /**
  * @defgroup svd cusolver svd operations
@@ -1509,5 +1508,6 @@ inline cusolverStatus_t cusolverDnxsyevd(  // NOLINT
 /** @} */
 #endif
 
+}  // namespace detail
 }  // namespace linalg
 }  // namespace raft
diff --git a/cpp/include/raft/linalg/detail/divide.hpp b/cpp/include/raft/linalg/detail/divide.hpp
new file mode 100644
index 0000000000..579a3317d6
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/divide.hpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/linalg/unary_op.hpp>
+#include "functional.cuh"
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+template <typename math_t, typename IdxType = int>
+void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
+{
+  raft::linalg::unaryOp(out, in, len, divides_scalar<math_t>(scalar), stream);
+}
+
+};  // end namespace detail
+};  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp
index 6475ce969b..859f058441 100644
--- a/cpp/include/raft/linalg/detail/eig.hpp
+++ b/cpp/include/raft/linalg/detail/eig.hpp
@@ -20,10 +20,10 @@
 #include <raft/cudart_utils.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/handle.hpp>
-#include <raft/linalg/cusolver_wrappers.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
+#include "cusolver_wrappers.hpp"
 
 namespace raft {
 namespace linalg {
@@ -137,8 +137,6 @@ void eigDC(const raft::handle_t& handle,
 
 enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT };
 
-#if CUDART_VERSION >= 10010
-
 /**
  * @defgroup eig decomp with divide and conquer method for the column-major
  * symmetric matrices
@@ -244,8 +242,6 @@ void eigSelDC(const raft::handle_t& handle,
   }
 }
 
-#endif
-
 template <typename math_t>
 void eigJacobi(const raft::handle_t& handle,
                const math_t* in,
diff --git a/cpp/include/raft/linalg/detail/eltwise.hpp b/cpp/include/raft/linalg/detail/eltwise.hpp
new file mode 100644
index 0000000000..e60c97e0e6
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/eltwise.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "functional.cuh"
+
+#include <raft/linalg/binary_op.hpp>
+#include <raft/linalg/unary_op.hpp>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+/** Forwards to raft::linalg::unaryOp with an adds_scalar<InType, OutType>(scalar) functor. */
+template <typename InType, typename IdxType, typename OutType = InType>
+void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
+{
+  raft::linalg::unaryOp(out, in, len, adds_scalar<InType, OutType>(scalar), stream);
+}
+/** Forwards to raft::linalg::unaryOp with a multiplies_scalar<InType, OutType>(scalar) functor. */
+template <typename InType, typename IdxType, typename OutType = InType>
+void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
+{
+  raft::linalg::unaryOp(out, in, len, multiplies_scalar<InType, OutType>(scalar), stream);
+}
+/** Forwards in1/in2 to raft::linalg::binaryOp with thrust::plus<InType> as the functor. */
+template <typename InType, typename IdxType, typename OutType = InType>
+void eltwiseAdd(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
+  raft::linalg::binaryOp(out, in1, in2, len, thrust::plus<InType>(), stream);
+}
+/** Forwards in1/in2 to raft::linalg::binaryOp with thrust::minus<InType> as the functor. */
+template <typename InType, typename IdxType, typename OutType = InType>
+void eltwiseSub(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
+  raft::linalg::binaryOp(out, in1, in2, len, thrust::minus<InType>(), stream);
+}
+/** Forwards in1/in2 to raft::linalg::binaryOp with thrust::multiplies<InType> as the functor. */
+template <typename InType, typename IdxType, typename OutType = InType>
+void eltwiseMultiply(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
+  raft::linalg::binaryOp(out, in1, in2, len, thrust::multiplies<InType>(), stream);
+}
+/** Forwards in1/in2 to raft::linalg::binaryOp with thrust::divides<InType> as the functor. */
+template <typename InType, typename IdxType, typename OutType = InType>
+void eltwiseDivide(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
+  raft::linalg::binaryOp(out, in1, in2, len, thrust::divides<InType>(), stream);
+}
+/** Forwards in1/in2 to raft::linalg::binaryOp with a divides_check_zero<InType, OutType> functor. */
+template <typename InType, typename IdxType, typename OutType = InType>
+void eltwiseDivideCheckZero(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
+  raft::linalg::binaryOp(out, in1, in2, len, divides_check_zero<InType, OutType>(), stream);
+}
+
+};  // end namespace detail
+};  // end namespace linalg
+};  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp
index 0954097b80..43f275ab51 100644
--- a/cpp/include/raft/linalg/detail/gemm.hpp
+++ b/cpp/include/raft/linalg/detail/gemm.hpp
@@ -19,7 +19,7 @@
 #include <cublas_v2.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/handle.hpp>
-#include <raft/linalg/cublas_wrappers.hpp>
+#include "cublas_wrappers.hpp"
 
 namespace raft {
 namespace linalg {
@@ -52,6 +52,25 @@ void gemm(const raft::handle_t& handle,
     cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream));
 }
 
+/** Convenience overload: forwards to the alpha/beta gemm overload with alpha = 1 and beta = 0. */
+template <typename math_t>
+void gemm(const raft::handle_t& handle,
+          const math_t* a,
+          int n_rows_a,
+          int n_cols_a,
+          const math_t* b,
+          math_t* c,
+          int n_rows_c,
+          int n_cols_c,
+          cublasOperation_t trans_a,
+          cublasOperation_t trans_b,
+          cudaStream_t stream)
+{
+  const math_t one = math_t(1), zero = math_t(0);
+  gemm(
+    handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, one, zero, stream);
+}
+
 template <typename T>
 void gemm(const raft::handle_t& handle,
           T* z,
diff --git a/cpp/include/raft/linalg/detail/gemv.hpp b/cpp/include/raft/linalg/detail/gemv.hpp
new file mode 100644
index 0000000000..b31fa71237
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/gemv.hpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cublas_v2.h>
+#include <raft/cuda_utils.cuh>
+#include "cublas_wrappers.hpp"
+
+#include <raft/handle.hpp>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+/** Calls cublasgemv with lda = n_rows; trans_a selects CUBLAS_OP_T over CUBLAS_OP_N. */
+template <typename math_t>
+void gemv(const raft::handle_t& handle,
+          const math_t* A,
+          const int n_rows,
+          const int n_cols,
+          const math_t* x,
+          const int incx,
+          math_t* y,
+          const int incy,
+          const bool trans_a,
+          const math_t alpha,
+          const math_t beta,
+          cudaStream_t stream)
+{
+  const cublasOperation_t op = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasHandle_t blas        = handle.get_cublas_handle();
+  RAFT_CUBLAS_TRY(
+    cublasgemv(blas, op, n_rows, n_cols, &alpha, A, n_rows, x, incx, &beta, y, incy, stream));
+}
+
+/** Unit-stride overload: forwards to the incx/incy overload with incx = incy = 1. */
+template <typename math_t>
+void gemv(const raft::handle_t& handle,
+          const math_t* A,
+          const int n_rows_a,
+          const int n_cols_a,
+          const math_t* x,
+          math_t* y,
+          const bool trans_a,
+          const math_t alpha,
+          const math_t beta,
+          cudaStream_t stream)
+{
+  gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream);
+}
+
+/** Unit-stride overload with alpha = 1 and beta = 0 (goes through the alpha/beta overload). */
+template <typename math_t>
+void gemv(const raft::handle_t& handle,
+          const math_t* A,
+          const int n_rows_a,
+          const int n_cols_a,
+          const math_t* x,
+          math_t* y,
+          const bool trans_a,
+          cudaStream_t stream)
+{
+  gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, math_t(1), math_t(0), stream);
+}
+
+/** Calls cublasgemv with the caller-supplied lda and strides of 1 for both x and y. */
+template <typename math_t>
+void gemv(const raft::handle_t& handle,
+          const math_t* A,
+          const int n_rows_a,
+          const int n_cols_a,
+          const int lda,
+          const math_t* x,
+          math_t* y,
+          const bool trans_a,
+          const math_t alpha,
+          const math_t beta,
+          cudaStream_t stream)
+{
+  const cublasOperation_t op = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasHandle_t blas        = handle.get_cublas_handle();
+  RAFT_CUBLAS_TRY(
+    cublasgemv(blas, op, n_rows_a, n_cols_a, &alpha, A, lda, x, 1, &beta, y, 1, stream));
+}
+
+/** Explicit-lda overload with alpha = 1 and beta = 0. */
+template <typename math_t>
+void gemv(const raft::handle_t& handle,
+          const math_t* A,
+          const int n_rows_a,
+          const int n_cols_a,
+          const int lda,
+          const math_t* x,
+          math_t* y,
+          const bool trans_a,
+          cudaStream_t stream)
+{
+  gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, math_t(1), math_t(0), stream);
+}
+
+}  // namespace detail
+}  // namespace linalg
+}  // namespace raft
diff --git a/cpp/include/raft/linalg/detail/init.hpp b/cpp/include/raft/linalg/detail/init.hpp
new file mode 100644
index 0000000000..9bae9533ea
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/init.hpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <rmm/exec_policy.hpp>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+/**
+ * @brief Fills out[i] = start + i for i in [0, end - start) on `stream`; no-op if end <= start.
+ */
+template <typename T>
+void range(T* out, int start, int end, cudaStream_t stream)
+{
+  // An inverted range would hand thrust::copy a negative iterator distance.
+  if (end <= start) { return; }
+  thrust::counting_iterator<int> first(start);
+  thrust::copy(rmm::exec_policy(stream), first, first + (end - start), thrust::device_ptr<T>(out));
+}
+
+/**
+ * @brief Like Python range: fills the output as out[i] = i by calling range(out, 0, n, stream).
+ * \param [out] out device array, size [n]
+ * \param [in] n length of the array
+ * \param [in] stream cuda stream
+ */
+template <typename T, int TPB = 256>
+void range(T* out, int n, cudaStream_t stream)
+{
+  range(out, 0, n, stream);
+}
+
+}  // namespace detail
+}  // namespace linalg
+}  // namespace raft
diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp
index f7052eef14..b08e95c760 100644
--- a/cpp/include/raft/linalg/detail/lanczos.hpp
+++ b/cpp/include/raft/linalg/detail/lanczos.hpp
@@ -27,15 +27,15 @@
 
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
-#include <raft/linalg/cublas_wrappers.hpp>
 #include <raft/spectral/lapack.hpp>
 #include <raft/spectral/matrix_wrappers.hpp>
 #include <raft/spectral/warn_dbg.hpp>
+#include "cublas_wrappers.hpp"
 
 namespace raft {
 
 using namespace matrix;
-using namespace linalg;
+using namespace linalg::detail;
 
 namespace spectral {
 namespace detail {
diff --git a/cpp/include/raft/linalg/detail/mean_squared_error.hpp b/cpp/include/raft/linalg/detail/mean_squared_error.hpp
new file mode 100644
index 0000000000..2ef9479b87
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/mean_squared_error.hpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/linalg/map_then_reduce.hpp>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+/** Invokes mapThenSumReduce over A and B with a functor computing (a - b)^2 * weight / len. */
+template <typename math_t, int TPB = 256>
+void meanSquaredError(
+  math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream)
+{
+  auto sq_err = [len, weight] __device__(const math_t a, const math_t b) {
+    const math_t delta = a - b;
+    return delta * delta * weight / len;
+  };
+  raft::linalg::mapThenSumReduce<math_t, decltype(sq_err), TPB>(out, len, sq_err, stream, A, B);
+}
+}  // end namespace detail
+}  // end namespace linalg
+}  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/multiply.hpp b/cpp/include/raft/linalg/detail/multiply.hpp
new file mode 100644
index 0000000000..2cd83920c5
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/multiply.hpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/linalg/unary_op.hpp>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+/** Invokes raft::linalg::unaryOp with a device lambda that returns its argument times `scalar`. */
+template <typename math_t, typename IdxType = int>
+void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
+{
+  auto scale = [scalar] __device__(math_t val) { return val * scalar; };
+  raft::linalg::unaryOp(out, in, len, scale, stream);
+}
+}  // end namespace detail
+}  // end namespace linalg
+}  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/norm.hpp b/cpp/include/raft/linalg/detail/norm.hpp
new file mode 100644
index 0000000000..492f34e59d
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/norm.hpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/linalg/reduce.hpp>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+/** different types of norms supported on the input buffers */
+enum NormType { L1Norm = 0, L2Norm };
+
+/**
+ * @brief Shared body of rowNormCaller and colNormCaller.
+ *
+ * Calls raft::linalg::reduce with (Type)0 as the initial value, `false` for the argument that
+ * follows the stream, raft::Sum<Type> as the reduction op and raft::L1Op / raft::L2Op (chosen
+ * by `type`) as the main op. `alongRows` is passed right after `rowMajor`; every other parameter
+ * is forwarded unchanged. Any other `type` value reaches the ASSERT.
+ */
+template <typename Type, typename IdxType, typename Lambda>
+void normCallerImpl(Type* dots,
+                    const Type* data,
+                    IdxType D,
+                    IdxType N,
+                    NormType type,
+                    bool rowMajor,
+                    bool alongRows,
+                    cudaStream_t stream,
+                    Lambda fin_op)
+{
+  switch (type) {
+    case L1Norm:
+      raft::linalg::reduce(dots,
+                           data,
+                           D,
+                           N,
+                           (Type)0,
+                           rowMajor,
+                           alongRows,
+                           stream,
+                           false,
+                           raft::L1Op<Type, IdxType>(),
+                           raft::Sum<Type>(),
+                           fin_op);
+      break;
+    case L2Norm:
+      raft::linalg::reduce(dots,
+                           data,
+                           D,
+                           N,
+                           (Type)0,
+                           rowMajor,
+                           alongRows,
+                           stream,
+                           false,
+                           raft::L2Op<Type, IdxType>(),
+                           raft::Sum<Type>(),
+                           fin_op);
+      break;
+    default: ASSERT(false, "Invalid norm type passed! [%d]", type);
+  };
+}
+
+/**
+ * @brief L1/L2 norm reduction that calls reduce with `true` in the position after rowMajor.
+ * Both norm types instantiate their main op with the caller's IdxType.
+ */
+template <typename Type, typename IdxType, typename Lambda>
+void rowNormCaller(Type* dots,
+                   const Type* data,
+                   IdxType D,
+                   IdxType N,
+                   NormType type,
+                   bool rowMajor,
+                   cudaStream_t stream,
+                   Lambda fin_op)
+{
+  normCallerImpl(dots, data, D, N, type, rowMajor, true, stream, fin_op);
+}
+
+/**
+ * @brief L1/L2 norm reduction that calls reduce with `false` in the position after rowMajor.
+ * Both norm types instantiate their main op with the caller's IdxType.
+ */
+template <typename Type, typename IdxType, typename Lambda>
+void colNormCaller(Type* dots,
+                   const Type* data,
+                   IdxType D,
+                   IdxType N,
+                   NormType type,
+                   bool rowMajor,
+                   cudaStream_t stream,
+                   Lambda fin_op)
+{
+  normCallerImpl(dots, data, D, N, type, rowMajor, false, stream, fin_op);
+}
+
+}  // end namespace detail
+}  // end namespace linalg
+}  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh
index 8dc46eeb9b..0614af4aec 100644
--- a/cpp/include/raft/linalg/detail/qr.cuh
+++ b/cpp/include/raft/linalg/detail/qr.cuh
@@ -16,11 +16,11 @@
 
 #pragma once
 
-#include <raft/linalg/cublas_wrappers.hpp>
-#include <raft/linalg/cusolver_wrappers.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
+#include "cublas_wrappers.hpp"
+#include "cusolver_wrappers.hpp"
 
 namespace raft {
 namespace linalg {
@@ -50,10 +50,7 @@ void qrGetQ(const raft::handle_t& handle,
   rmm::device_uvector<math_t> workspace(Lwork, stream);
   RAFT_CUSOLVER_TRY(cusolverDngeqrf(
     cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream));
-  /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail.
-#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020
-  RAFT_CUDA_TRY(cudaDeviceSynchronize());
-#endif
+
   RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork));
   workspace.resize(Lwork, stream);
   RAFT_CUSOLVER_TRY(cusolverDnorgqr(
@@ -95,10 +92,6 @@ void qrGetQR(const raft::handle_t& handle,
                                     Lwork,
                                     devInfo.data(),
                                     stream));
-  // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail.
-#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020
-  RAFT_CUDA_TRY(cudaDeviceSynchronize());
-#endif
 
   raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream);
 
diff --git a/cpp/include/raft/linalg/detail/reduce.hpp b/cpp/include/raft/linalg/detail/reduce.hpp
new file mode 100644
index 0000000000..181a7d52b1
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/reduce.hpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/coalesced_reduction.hpp>
+#include <raft/linalg/strided_reduction.hpp>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+/**
+ * @brief Picks coalescedReduction when rowMajor == alongRows and stridedReduction otherwise;
+ *        (D, N) are passed as-is for rowMajor input and swapped to (N, D) when !rowMajor.
+ */
+template <typename InType,
+          typename OutType      = InType,
+          typename IdxType      = int,
+          typename MainLambda   = raft::Nop<InType, IdxType>,
+          typename ReduceLambda = raft::Sum<OutType>,
+          typename FinalLambda  = raft::Nop<OutType>>
+void reduce(OutType* dots,
+            const InType* data,
+            int D,
+            int N,
+            OutType init,
+            bool rowMajor,
+            bool alongRows,
+            cudaStream_t stream,
+            bool inplace           = false,
+            MainLambda main_op     = raft::Nop<InType, IdxType>(),
+            ReduceLambda reduce_op = raft::Sum<OutType>(),
+            FinalLambda final_op   = raft::Nop<OutType>())
+{
+  const int dim0 = rowMajor ? D : N;
+  const int dim1 = rowMajor ? N : D;
+  if (rowMajor == alongRows) {
+    raft::linalg::coalescedReduction(
+      dots, data, dim0, dim1, init, stream, inplace, main_op, reduce_op, final_op);
+  } else {
+    raft::linalg::stridedReduction(
+      dots, data, dim0, dim1, init, stream, inplace, main_op, reduce_op, final_op);
+  }
+}
+
+}  // end namespace detail
+}  // end namespace linalg
+}  // end namespace raft
diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh
index a2e91a381a..26fe258825 100644
--- a/cpp/include/raft/linalg/detail/subtract.cuh
+++ b/cpp/include/raft/linalg/detail/subtract.cuh
@@ -24,6 +24,20 @@ namespace raft {
 namespace linalg {
 namespace detail {
 
+/** Passes a device lambda computing OutT(in - scalar) to raft::linalg::unaryOp. */
+template <typename InT, typename OutT = InT, typename IdxType = int>
+void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
+{
+  auto sub = [scalar] __device__(InT val) { return OutT(val - scalar); };
+  raft::linalg::unaryOp<InT, decltype(sub), IdxType, OutT>(out, in, len, sub, stream);
+}
+/** Passes a device lambda computing OutT(a - b) to raft::linalg::binaryOp over in1 and in2. */
+template <typename InT, typename OutT = InT, typename IdxType = int>
+void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
+{
+  auto diff = [] __device__(InT lhs, InT rhs) { return OutT(lhs - rhs); };
+  raft::linalg::binaryOp<InT, decltype(diff), OutT, IdxType>(out, in1, in2, len, diff, stream);
+}
 template <class math_t, typename IdxType>
 __global__ void subtract_dev_scalar_kernel(math_t* outDev,
                                            const math_t* inDev,
diff --git a/cpp/include/raft/linalg/detail/svd.cuh b/cpp/include/raft/linalg/detail/svd.cuh
index 81bfa06f27..0d1d128f6f 100644
--- a/cpp/include/raft/linalg/detail/svd.cuh
+++ b/cpp/include/raft/linalg/detail/svd.cuh
@@ -19,8 +19,6 @@
 #include <raft/cudart_utils.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/handle.hpp>
-#include <raft/linalg/cublas_wrappers.hpp>
-#include <raft/linalg/cusolver_wrappers.hpp>
 #include <raft/linalg/eig.hpp>
 #include <raft/linalg/gemm.hpp>
 #include <raft/linalg/transpose.hpp>
@@ -28,6 +26,8 @@
 #include <raft/matrix/matrix.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
+#include "cublas_wrappers.hpp"
+#include "cusolver_wrappers.hpp"
 
 namespace raft {
 namespace linalg {
@@ -49,15 +49,6 @@ void svdQR(const raft::handle_t& handle,
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
   cublasHandle_t cublasH       = handle.get_cublas_handle();
 
-#if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000
-  // 46340: sqrt of max int value
-  ASSERT(n_rows <= 46340,
-         "svd solver is not supported for the data that has more than 46340 "
-         "samples (rows) "
-         "if you are using CUDA version <11. Please use other solvers such as "
-         "eig if it is available.");
-#endif
-
   const int m = n_rows;
   const int n = n_cols;
 
@@ -200,44 +191,75 @@ void svdJacobi(const raft::handle_t& handle,
   int lwork = 0;
   int econ  = 1;
 
-  RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngesvdj_bufferSize(cusolverH,
-                                                              CUSOLVER_EIG_MODE_VECTOR,
-                                                              econ,
-                                                              m,
-                                                              n,
-                                                              in,
-                                                              m,
-                                                              sing_vals,
-                                                              left_sing_vecs,
-                                                              m,
-                                                              right_sing_vecs,
-                                                              n,
-                                                              &lwork,
-                                                              gesvdj_params));
+  RAFT_CUSOLVER_TRY(cusolverDngesvdj_bufferSize(cusolverH,
+                                                CUSOLVER_EIG_MODE_VECTOR,
+                                                econ,
+                                                m,
+                                                n,
+                                                in,
+                                                m,
+                                                sing_vals,
+                                                left_sing_vecs,
+                                                m,
+                                                right_sing_vecs,
+                                                n,
+                                                &lwork,
+                                                gesvdj_params));
 
   rmm::device_uvector<math_t> d_work(lwork, stream);
 
-  RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngesvdj(cusolverH,
-                                                   CUSOLVER_EIG_MODE_VECTOR,
-                                                   econ,
-                                                   m,
-                                                   n,
-                                                   in,
-                                                   m,
-                                                   sing_vals,
-                                                   left_sing_vecs,
-                                                   m,
-                                                   right_sing_vecs,
-                                                   n,
-                                                   d_work.data(),
-                                                   lwork,
-                                                   devInfo.data(),
-                                                   gesvdj_params,
-                                                   stream));
+  RAFT_CUSOLVER_TRY(cusolverDngesvdj(cusolverH,
+                                     CUSOLVER_EIG_MODE_VECTOR,
+                                     econ,
+                                     m,
+                                     n,
+                                     in,
+                                     m,
+                                     sing_vals,
+                                     left_sing_vecs,
+                                     m,
+                                     right_sing_vecs,
+                                     n,
+                                     d_work.data(),
+                                     lwork,
+                                     devInfo.data(),
+                                     gesvdj_params,
+                                     stream));
 
   RAFT_CUSOLVER_TRY(cusolverDnDestroyGesvdjInfo(gesvdj_params));
 }
 
+/**
+ * @brief Rebuilds `out` from SVD factors using two raft::linalg::gemm calls.
+ *
+ * First gemm: S passed as k x k, V with CUBLAS_OP_T, result SVT as k x n_cols (SVT = S * V^T
+ * under the usual gemm convention). Second gemm: U passed as n_rows x k times SVT, result `out`
+ * as n_rows x n_cols. alpha = 1 and beta = 0 in both calls.
+ * @param k extent shared by U's columns, S and SVT's rows
+ * @param stream cuda stream for the SVT scratch allocation and both gemm calls
+ */
+template <typename math_t>
+void svdReconstruction(const raft::handle_t& handle,
+                       math_t* U,
+                       math_t* S,
+                       math_t* V,
+                       math_t* out,
+                       int n_rows,
+                       int n_cols,
+                       int k,
+                       cudaStream_t stream)
+{
+  const math_t alpha = math_t(1);
+  const math_t beta  = math_t(0);
+  // Widen before multiplying: the int product k * n_cols can overflow for large shapes.
+  rmm::device_uvector<math_t> SVT(static_cast<size_t>(k) * static_cast<size_t>(n_cols), stream);
+
+  raft::linalg::gemm(
+    handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream);
+  raft::linalg::gemm(handle, U, n_rows, k, SVT.data(), out, n_rows, n_cols,
+                     CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream);
+}
+
 template <typename math_t>
 bool evaluateSVDByL2Norm(const raft::handle_t& handle,
                          math_t* A_d,
@@ -275,20 +297,20 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle,
   rmm::device_uvector<math_t> A_minus_P(m * n, stream);
   RAFT_CUDA_TRY(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream));
 
-  RAFT_CUBLAS_TRY(raft::linalg::cublasgeam(cublasH,
-                                           CUBLAS_OP_N,
-                                           CUBLAS_OP_N,
-                                           m,
-                                           n,
-                                           &alpha,
-                                           A_d,
-                                           m,
-                                           &beta,
-                                           P_d.data(),
-                                           m,
-                                           A_minus_P.data(),
-                                           m,
-                                           stream));
+  RAFT_CUBLAS_TRY(cublasgeam(cublasH,
+                             CUBLAS_OP_N,
+                             CUBLAS_OP_N,
+                             m,
+                             n,
+                             &alpha,
+                             A_d,
+                             m,
+                             &beta,
+                             P_d.data(),
+                             m,
+                             A_minus_P.data(),
+                             m,
+                             stream));
 
   math_t norm_A_minus_P = raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream);
   math_t percent_error  = 100.0 * norm_A_minus_P / normA;
diff --git a/cpp/include/raft/linalg/detail/transpose.hpp b/cpp/include/raft/linalg/detail/transpose.hpp
new file mode 100644
index 0000000000..b55843bd96
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/transpose.hpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/handle.hpp>
+#include <rmm/exec_policy.hpp>
+#include "cublas_wrappers.hpp"
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+/**
+ * @brief Out-of-place transpose implemented with a single cublasgeam call.
+ *
+ * `in` is passed as the A operand with CUBLAS_OP_T and leading dimension n_rows, scaled by
+ * alpha = 1. `out` is passed both as the B operand (CUBLAS_OP_N, scaled by beta = 0) and as the
+ * result, with n_cols rows, n_rows columns and leading dimension n_cols.
+ *
+ * @param stream cuda stream handed to cublasgeam
+ */
+template <typename math_t>
+void transpose(const raft::handle_t& handle,
+               math_t* in,
+               math_t* out,
+               int n_rows,
+               int n_cols,
+               cudaStream_t stream)
+{
+  cublasHandle_t cublas_h = handle.get_cublas_handle();
+  const math_t one        = 1.0;
+  const math_t zero       = 0.0;
+  // The output has the input's two extents swapped.
+  const int t_rows = n_cols;
+  const int t_cols = n_rows;
+
+  RAFT_CUBLAS_TRY(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, t_rows, t_cols, &one, in, n_rows,
+                             &zero, out, t_rows, out, t_rows, stream));
+}
+
+/**
+ * @brief In-place transpose of an n x n buffer.
+ *
+ * Runs thrust::for_each over the indices [0, n * n) under rmm::exec_policy(stream). For an index
+ * with r = idx % n and c = idx / n, the elements at offsets r * n + c and c * n + r are swapped
+ * when r < c, so each off-diagonal pair is exchanged exactly once.
+ */
+template <typename math_t>
+void transpose(math_t* inout, int n, cudaStream_t stream)
+{
+  auto indices   = thrust::make_counting_iterator<int>(0);
+  auto n_elems   = n * n;
+  auto swap_pair = [inout, n] __device__(int idx) {
+    const int r = idx % n;
+    const int c = idx / n;
+    if (r < c) {
+      auto saved       = inout[r * n + c];
+      inout[r * n + c] = inout[c * n + r];
+      inout[c * n + r] = saved;
+    }
+  };
+  thrust::for_each(rmm::exec_policy(stream), indices, indices + n_elems, swap_pair);
+}
+
+}  // end namespace detail
+}  // end namespace linalg
+}  // end namespace raft
diff --git a/cpp/include/raft/linalg/divide.hpp b/cpp/include/raft/linalg/divide.hpp
index ecf0d3a48d..a93ffe64fc 100644
--- a/cpp/include/raft/linalg/divide.hpp
+++ b/cpp/include/raft/linalg/divide.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,7 @@
 
 #pragma once
 
-#include "detail/functional.cuh"
-#include "unary_op.hpp"
+#include "detail/divide.hpp"
 
 namespace raft {
 namespace linalg {
@@ -38,7 +37,7 @@ using detail::divides_scalar;
 template <typename math_t, typename IdxType = int>
 void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
 {
-  unaryOp(out, in, len, divides_scalar<math_t>(scalar), stream);
+  detail::divideScalar(out, in, scalar, len, stream);
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp
index 91a475f25f..1680e58cbf 100644
--- a/cpp/include/raft/linalg/eig.hpp
+++ b/cpp/include/raft/linalg/eig.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -50,8 +50,6 @@ using detail::COPY_INPUT;
 using detail::EigVecMemUsage;
 using detail::OVERWRITE_INPUT;
 
-#if CUDART_VERSION >= 10010
-
 /**
  * @defgroup eig decomp with divide and conquer method for the column-major
  * symmetric matrices
@@ -80,8 +78,6 @@ void eigSelDC(const raft::handle_t& handle,
   detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, eig_vals, memUsage, stream);
 }
 
-#endif
-
 /**
  * @defgroup overloaded function for eig decomp with Jacobi method for the
  * column-major symmetric matrices (in parameter)
diff --git a/cpp/include/raft/linalg/eltwise.hpp b/cpp/include/raft/linalg/eltwise.hpp
index 5a5b5c647b..930a125be7 100644
--- a/cpp/include/raft/linalg/eltwise.hpp
+++ b/cpp/include/raft/linalg/eltwise.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,10 +16,7 @@
 
 #pragma once
 
-#include "detail/functional.cuh"
-
-#include "binary_op.hpp"
-#include "unary_op.hpp"
+#include "detail/eltwise.hpp"
 
 namespace raft {
 namespace linalg {
@@ -40,7 +37,7 @@ using detail::adds_scalar;
 template <typename InType, typename IdxType, typename OutType = InType>
 void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::unaryOp(out, in, len, adds_scalar<InType, OutType>(scalar), stream);
+  detail::scalarAdd(out, in, scalar, len, stream);
 }
 
 using detail::multiplies_scalar;
@@ -48,7 +45,7 @@ using detail::multiplies_scalar;
 template <typename InType, typename IdxType, typename OutType = InType>
 void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
 {
-  raft::linalg::unaryOp(out, in, len, multiplies_scalar<InType, OutType>(scalar), stream);
+  detail::scalarMultiply(out, in, scalar, len, stream);
 }
 /** @} */
 
@@ -67,28 +64,28 @@ template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseAdd(
   OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
 {
-  binaryOp(out, in1, in2, len, thrust::plus<InType>(), stream);
+  detail::eltwiseAdd(out, in1, in2, len, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseSub(
   OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
 {
-  binaryOp(out, in1, in2, len, thrust::minus<InType>(), stream);
+  detail::eltwiseSub(out, in1, in2, len, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseMultiply(
   OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
 {
-  binaryOp(out, in1, in2, len, thrust::multiplies<InType>(), stream);
+  detail::eltwiseMultiply(out, in1, in2, len, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseDivide(
   OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
 {
-  binaryOp(out, in1, in2, len, thrust::divides<InType>(), stream);
+  detail::eltwiseDivide(out, in1, in2, len, stream);
 }
 
 using detail::divides_check_zero;
@@ -97,7 +94,7 @@ template <typename InType, typename IdxType, typename OutType = InType>
 void eltwiseDivideCheckZero(
   OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
 {
-  binaryOp(out, in1, in2, len, divides_check_zero<InType, OutType>(), stream);
+  detail::eltwiseDivideCheckZero(out, in1, in2, len, stream);
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp
index 624aa7232b..19f79b2259 100644
--- a/cpp/include/raft/linalg/gemm.hpp
+++ b/cpp/include/raft/linalg/gemm.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -58,6 +58,22 @@ void gemm(const raft::handle_t& handle,
     handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream);
 }
 
+/**
+ * @brief the wrapper of cublas gemm function
+ *  It computes C = opA(A) * opB(B), i.e. the general gemm equation with alpha = 1 and beta = 0
+ * @tparam math_t the type of input/output matrices
+ * @param handle raft handle
+ * @param a input matrix
+ * @param n_rows_a number of rows of A
+ * @param n_cols_a number of columns of A
+ * @param b input matrix
+ * @param c output matrix
+ * @param n_rows_c number of rows of C
+ * @param n_cols_c number of columns of C
+ * @param trans_a cublas transpose op for A
+ * @param trans_b cublas transpose op for B
+ * @param stream cuda stream
+ */
 template <typename math_t>
 void gemm(const raft::handle_t& handle,
           const math_t* a,
@@ -71,10 +87,7 @@ void gemm(const raft::handle_t& handle,
           cublasOperation_t trans_b,
           cudaStream_t stream)
 {
-  math_t alpha = math_t(1);
-  math_t beta  = math_t(0);
-  gemm(
-    handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream);
+  detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, stream);
 }
 
 /**
diff --git a/cpp/include/raft/linalg/gemv.hpp b/cpp/include/raft/linalg/gemv.hpp
index 7dfd1f1db1..2a123e8895 100644
--- a/cpp/include/raft/linalg/gemv.hpp
+++ b/cpp/include/raft/linalg/gemv.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,11 +16,7 @@
 
 #pragma once
 
-#include <cublas_v2.h>
-#include <raft/cuda_utils.cuh>
-#include <raft/linalg/cublas_wrappers.hpp>
-
-#include <raft/handle.hpp>
+#include "detail/gemv.hpp"
 
 namespace raft {
 namespace linalg {
@@ -39,10 +35,7 @@ void gemv(const raft::handle_t& handle,
           const math_t beta,
           cudaStream_t stream)
 {
-  cublasHandle_t cublas_h = handle.get_cublas_handle();
-  cublasOperation_t op_a  = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
-  RAFT_CUBLAS_TRY(
-    cublasgemv(cublas_h, op_a, n_rows, n_cols, &alpha, A, n_rows, x, incx, &beta, y, incy, stream));
+  detail::gemv(handle, A, n_rows, n_cols, x, incx, y, incy, trans_a, alpha, beta, stream);
 }
 
 /**
@@ -53,10 +46,6 @@ void gemv(const raft::handle_t& handle,
  * @param A is a column-major matrix of size n_rows_a * n_cols_a.
  *   op(A) is either the transpose operation (trans_a == true) or identity.
  *
- * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a.
- *     set it when you need to use only the first n_rows_a rows of the matrix A, which has
- *     (perhaps, due to padding) lda rows.
- *
  * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`.
  *
  * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`.
@@ -73,7 +62,7 @@ void gemv(const raft::handle_t& handle,
           const math_t beta,
           cudaStream_t stream)
 {
-  gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream);
+  detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, alpha, beta, stream);
 }
 
 /**
@@ -98,10 +87,7 @@ void gemv(const raft::handle_t& handle,
           const bool trans_a,
           cudaStream_t stream)
 {
-  math_t alpha = math_t(1);
-  math_t beta  = math_t(0);
-
-  gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream);
+  detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, stream);
 }
 
 /**
@@ -137,10 +123,7 @@ void gemv(const raft::handle_t& handle,
           const math_t beta,
           cudaStream_t stream)
 {
-  cublasHandle_t cublas_h = handle.get_cublas_handle();
-  cublasOperation_t op_a  = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
-  RAFT_CUBLAS_TRY(
-    cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, 1, &beta, y, 1, stream));
+  detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream);
 }
 
 /**
@@ -171,9 +154,7 @@ void gemv(const raft::handle_t& handle,
           const bool trans_a,
           cudaStream_t stream)
 {
-  math_t alpha = math_t(1);
-  math_t beta  = math_t(0);
-  gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream);
+  detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, stream);
 }
 
 };  // namespace linalg
diff --git a/cpp/include/raft/linalg/init.hpp b/cpp/include/raft/linalg/init.hpp
index 41ef4d4641..10498363e7 100644
--- a/cpp/include/raft/linalg/init.hpp
+++ b/cpp/include/raft/linalg/init.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,16 +16,11 @@
 
 #pragma once
 
-#include <thrust/copy.h>
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <rmm/exec_policy.hpp>
+#include "detail/init.hpp"
 
 namespace raft {
 namespace linalg {
 
-namespace {
-
 /**
  * @brief Like Python range.
  *
@@ -39,10 +34,7 @@ namespace {
 template <typename T>
 void range(T* out, int start, int end, cudaStream_t stream)
 {
-  thrust::counting_iterator<int> first(start);
-  thrust::counting_iterator<int> last = first + (end - start);
-  thrust::device_ptr<T> ptr(out);
-  thrust::copy(rmm::exec_policy(stream), first, last, ptr);
+  detail::range(out, start, end, stream);
 }
 
 /**
@@ -57,8 +49,8 @@ void range(T* out, int start, int end, cudaStream_t stream)
 template <typename T, int TPB = 256>
 void range(T* out, int n, cudaStream_t stream)
 {
-  range(out, 0, n, stream);
+  detail::range(out, n, stream);
 }
-}  // unnamed namespace
+
 }  // namespace linalg
 }  // namespace raft
diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp
index 34db473edb..43164b676a 100644
--- a/cpp/include/raft/linalg/lanczos.hpp
+++ b/cpp/include/raft/linalg/lanczos.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/map.hpp b/cpp/include/raft/linalg/map.hpp
index c14fb7ba2b..1c4b6816ae 100644
--- a/cpp/include/raft/linalg/map.hpp
+++ b/cpp/include/raft/linalg/map.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp
index d4d7087339..48c0318798 100644
--- a/cpp/include/raft/linalg/map_then_reduce.hpp
+++ b/cpp/include/raft/linalg/map_then_reduce.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/matrix_vector_op.hpp b/cpp/include/raft/linalg/matrix_vector_op.hpp
index f088ef4dce..a8a805b4c2 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.hpp
+++ b/cpp/include/raft/linalg/matrix_vector_op.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/mean_squared_error.hpp b/cpp/include/raft/linalg/mean_squared_error.hpp
index f6318e1754..3a97a4396e 100644
--- a/cpp/include/raft/linalg/mean_squared_error.hpp
+++ b/cpp/include/raft/linalg/mean_squared_error.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "map_then_reduce.hpp"
+#include "detail/mean_squared_error.hpp"
 
 namespace raft {
 namespace linalg {
@@ -36,11 +36,7 @@ template <typename math_t, int TPB = 256>
 void meanSquaredError(
   math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream)
 {
-  auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) {
-    math_t diff = a - b;
-    return diff * diff * weight / len;
-  };
-  mapThenSumReduce<math_t, decltype(sq_diff), TPB>(out, len, sq_diff, stream, A, B);
+  detail::meanSquaredError(out, A, B, len, weight, stream);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/multiply.hpp b/cpp/include/raft/linalg/multiply.hpp
index 66566692d5..edc84f2bcf 100644
--- a/cpp/include/raft/linalg/multiply.hpp
+++ b/cpp/include/raft/linalg/multiply.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "unary_op.hpp"
+#include "detail/multiply.hpp"
 
 namespace raft {
 namespace linalg {
@@ -35,8 +35,7 @@ namespace linalg {
 template <typename math_t, typename IdxType = int>
 void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
 {
-  unaryOp(
-    out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream);
+  detail::multiplyScalar(out, in, scalar, len, stream);
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/norm.hpp b/cpp/include/raft/linalg/norm.hpp
index 5b0de91513..79b060454b 100644
--- a/cpp/include/raft/linalg/norm.hpp
+++ b/cpp/include/raft/linalg/norm.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,13 +16,15 @@
 
 #pragma once
 
-#include "reduce.hpp"
+#include "detail/norm.hpp"
 
 namespace raft {
 namespace linalg {
 
 /** different types of norms supported on the input buffers */
-enum NormType { L1Norm = 0, L2Norm };
+using detail::L1Norm;
+using detail::L2Norm;
+using detail::NormType;
 
 /**
  * @brief Compute row-wise norm of the input matrix and perform fin_op lambda
@@ -54,37 +56,7 @@ void rowNorm(Type* dots,
              cudaStream_t stream,
              Lambda fin_op = raft::Nop<Type, IdxType>())
 {
-  switch (type) {
-    case L1Norm:
-      reduce(dots,
-             data,
-             D,
-             N,
-             (Type)0,
-             rowMajor,
-             true,
-             stream,
-             false,
-             raft::L1Op<Type, IdxType>(),
-             raft::Sum<Type>(),
-             fin_op);
-      break;
-    case L2Norm:
-      reduce(dots,
-             data,
-             D,
-             N,
-             (Type)0,
-             rowMajor,
-             true,
-             stream,
-             false,
-             raft::L2Op<Type>(),
-             raft::Sum<Type>(),
-             fin_op);
-      break;
-    default: ASSERT(false, "Invalid norm type passed! [%d]", type);
-  };
+  detail::rowNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op);
 }
 
 /**
@@ -111,37 +83,7 @@ void colNorm(Type* dots,
              cudaStream_t stream,
              Lambda fin_op = raft::Nop<Type, IdxType>())
 {
-  switch (type) {
-    case L1Norm:
-      reduce(dots,
-             data,
-             D,
-             N,
-             (Type)0,
-             rowMajor,
-             false,
-             stream,
-             false,
-             raft::L1Op<Type, IdxType>(),
-             raft::Sum<Type>(),
-             fin_op);
-      break;
-    case L2Norm:
-      reduce(dots,
-             data,
-             D,
-             N,
-             (Type)0,
-             rowMajor,
-             false,
-             stream,
-             false,
-             raft::L2Op<Type, IdxType>(),
-             raft::Sum<Type>(),
-             fin_op);
-      break;
-    default: ASSERT(false, "Invalid norm type passed! [%d]", type);
-  };
+  detail::colNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/qr.hpp b/cpp/include/raft/linalg/qr.hpp
index b0e9eed5e2..fb1c6be8be 100644
--- a/cpp/include/raft/linalg/qr.hpp
+++ b/cpp/include/raft/linalg/qr.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/reduce.hpp b/cpp/include/raft/linalg/reduce.hpp
index 339245e946..ca9ad34dc8 100644
--- a/cpp/include/raft/linalg/reduce.hpp
+++ b/cpp/include/raft/linalg/reduce.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,9 +16,7 @@
 
 #pragma once
 
-#include <raft/cuda_utils.cuh>
-#include "coalesced_reduction.hpp"
-#include "strided_reduction.hpp"
+#include "detail/reduce.hpp"
 
 namespace raft {
 namespace linalg {
@@ -71,15 +69,8 @@ void reduce(OutType* dots,
             ReduceLambda reduce_op = raft::Sum<OutType>(),
             FinalLambda final_op   = raft::Nop<OutType>())
 {
-  if (rowMajor && alongRows) {
-    coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
-  } else if (rowMajor && !alongRows) {
-    stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
-  } else if (!rowMajor && alongRows) {
-    stridedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op);
-  } else {
-    coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op);
-  }
+  detail::reduce(
+    dots, data, D, N, init, rowMajor, alongRows, stream, inplace, main_op, reduce_op, final_op);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp
index 7e2b5229ec..f8c37f07e0 100644
--- a/cpp/include/raft/linalg/strided_reduction.hpp
+++ b/cpp/include/raft/linalg/strided_reduction.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/subtract.hpp b/cpp/include/raft/linalg/subtract.hpp
index 88946646c8..716db1a195 100644
--- a/cpp/include/raft/linalg/subtract.hpp
+++ b/cpp/include/raft/linalg/subtract.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,8 +38,7 @@ namespace linalg {
 template <typename InT, typename OutT = InT, typename IdxType = int>
 void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
 {
-  auto op = [scalar] __device__(InT in) { return OutT(in - scalar); };
-  unaryOp<InT, decltype(op), IdxType, OutT>(out, in, len, op, stream);
+  detail::subtractScalar(out, in, scalar, len, stream);
 }
 
 /**
@@ -58,8 +57,7 @@ void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStrea
 template <typename InT, typename OutT = InT, typename IdxType = int>
 void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
 {
-  auto op = [] __device__(InT a, InT b) { return OutT(a - b); };
-  binaryOp<InT, decltype(op), OutT, IdxType>(out, in1, in2, len, op, stream);
+  detail::subtract(out, in1, in2, len, stream);
 }
 
 /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and
diff --git a/cpp/include/raft/linalg/svd.hpp b/cpp/include/raft/linalg/svd.hpp
index 62ac19b592..331796c2ca 100644
--- a/cpp/include/raft/linalg/svd.hpp
+++ b/cpp/include/raft/linalg/svd.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -148,24 +148,7 @@ void svdReconstruction(const raft::handle_t& handle,
                        int k,
                        cudaStream_t stream)
 {
-  const math_t alpha = 1.0, beta = 0.0;
-  rmm::device_uvector<math_t> SVT(k * n_cols, stream);
-
-  raft::linalg::gemm(
-    handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream);
-  raft::linalg::gemm(handle,
-                     U,
-                     n_rows,
-                     k,
-                     SVT.data(),
-                     out,
-                     n_rows,
-                     n_cols,
-                     CUBLAS_OP_N,
-                     CUBLAS_OP_N,
-                     alpha,
-                     beta,
-                     stream);
+  detail::svdReconstruction(handle, U, S, V, out, n_rows, n_cols, k, stream);
 }
 
 /**
diff --git a/cpp/include/raft/linalg/transpose.hpp b/cpp/include/raft/linalg/transpose.hpp
index 09e9e67e7b..4c16df331c 100644
--- a/cpp/include/raft/linalg/transpose.hpp
+++ b/cpp/include/raft/linalg/transpose.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,9 +16,7 @@
 
 #pragma once
 
-#include <raft/handle.hpp>
-#include <raft/linalg/cublas_wrappers.hpp>
-#include <rmm/exec_policy.hpp>
+#include "detail/transpose.hpp"
 
 namespace raft {
 namespace linalg {
@@ -40,27 +38,7 @@ void transpose(const raft::handle_t& handle,
                int n_cols,
                cudaStream_t stream)
 {
-  cublasHandle_t cublas_h = handle.get_cublas_handle();
-
-  int out_n_rows = n_cols;
-  int out_n_cols = n_rows;
-
-  const math_t alpha = 1.0;
-  const math_t beta  = 0.0;
-  RAFT_CUBLAS_TRY(raft::linalg::cublasgeam(cublas_h,
-                                           CUBLAS_OP_T,
-                                           CUBLAS_OP_N,
-                                           out_n_rows,
-                                           out_n_cols,
-                                           &alpha,
-                                           in,
-                                           n_rows,
-                                           &beta,
-                                           out,
-                                           out_n_rows,
-                                           out,
-                                           out_n_rows,
-                                           stream));
+  detail::transpose(handle, in, out, n_rows, n_cols, stream);
 }
 
 /**
@@ -72,22 +50,7 @@ void transpose(const raft::handle_t& handle,
 template <typename math_t>
 void transpose(math_t* inout, int n, cudaStream_t stream)
 {
-  auto m        = n;
-  auto size     = n * n;
-  auto d_inout  = inout;
-  auto counting = thrust::make_counting_iterator<int>(0);
-
-  thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(int idx) {
-    int s_row = idx % m;
-    int s_col = idx / m;
-    int d_row = s_col;
-    int d_col = s_row;
-    if (s_row < s_col) {
-      auto temp                  = d_inout[d_col * m + d_row];
-      d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row];
-      d_inout[s_col * m + s_row] = temp;
-    }
-  });
+  detail::transpose(inout, n, stream);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/unary_op.hpp b/cpp/include/raft/linalg/unary_op.hpp
index c54e3cc1c3..a7753ccff7 100644
--- a/cpp/include/raft/linalg/unary_op.hpp
+++ b/cpp/include/raft/linalg/unary_op.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp
index 00651a9b62..42be8d7bab 100644
--- a/cpp/include/raft/matrix/matrix.hpp
+++ b/cpp/include/raft/matrix/matrix.hpp
@@ -24,7 +24,7 @@
 #include <algorithm>
 #include <cstddef>
 #include <raft/handle.hpp>
-#include <raft/linalg/cublas_wrappers.hpp>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
 
 namespace raft {
 namespace matrix {
@@ -285,7 +285,8 @@ m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t st
 {
   cublasHandle_t cublasH = handle.get_cublas_handle();
   m_t normval            = 0;
-  RAFT_CUBLAS_TRY(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(cublasH, size, in, 1, &normval, stream));
   return normval;
 }
 
diff --git a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
index 141e5b3e5f..21faffc17d 100644
--- a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
@@ -23,7 +23,7 @@
 #include <raft/sparse/detail/utils.h>
 #include <raft/sparse/distance/common.h>
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/sparse/distance/detail/ip_distance.cuh>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
index 0f8b2d99bb..63c39457af 100644
--- a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
@@ -20,7 +20,7 @@
 #include <raft/cudart_utils.h>
 #include <raft/sparse/cusparse_wrappers.h>
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 
 #include <raft/sparse/detail/utils.h>
 #include <raft/sparse/distance/common.h>
diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
index 62bfb7671e..ef578f0cf1 100644
--- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
@@ -23,7 +23,7 @@
 #include <raft/sparse/detail/utils.h>
 #include <raft/sparse/distance/common.h>
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/linalg/unary_op.hpp>
 #include <raft/sparse/csr.hpp>
 #include <raft/sparse/distance/detail/ip_distance.cuh>
diff --git a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
index d062705b57..78f131b04b 100644
--- a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
@@ -21,7 +21,7 @@
 #include <raft/cudart_utils.h>
 #include <raft/sparse/cusparse_wrappers.h>
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp
index c49730bdb9..8a709ae5ea 100644
--- a/cpp/include/raft/sparse/distance/distance.hpp
+++ b/cpp/include/raft/sparse/distance/distance.hpp
@@ -21,7 +21,7 @@
 
 #include <raft/sparse/cusparse_wrappers.h>
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/mr/device/buffer.hpp>
 
 #include <raft/sparse/detail/utils.h>
diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
index 0c47b22201..e184d2be6e 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
@@ -24,7 +24,7 @@
 #include <rmm/device_uvector.hpp>
 
 #include <raft/sparse/hierarchy/common.h>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/mr/device/buffer.hpp>
 #include <raft/sparse/convert/csr.hpp>
 #include <raft/sparse/coo.hpp>
diff --git a/cpp/include/raft/sparse/selection/detail/knn.cuh b/cpp/include/raft/sparse/selection/detail/knn.cuh
index 21a40cf626..947610d8cf 100644
--- a/cpp/include/raft/sparse/selection/detail/knn.cuh
+++ b/cpp/include/raft/sparse/selection/detail/knn.cuh
@@ -20,7 +20,7 @@
 
 #include <raft/cudart_utils.h>
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/linalg/unary_op.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/mr/device/buffer.hpp>
diff --git a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh b/cpp/include/raft/sparse/selection/detail/knn_graph.cuh
index c96fefdc5d..c1f98eae12 100644
--- a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh
+++ b/cpp/include/raft/sparse/selection/detail/knn_graph.cuh
@@ -24,7 +24,7 @@
 
 #include <raft/spatial/knn/knn.hpp>
 
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <thrust/device_ptr.h>
diff --git a/cpp/include/raft/sparse/selection/knn.hpp b/cpp/include/raft/sparse/selection/knn.hpp
index bfc0c14a8c..bb5edd2f17 100644
--- a/cpp/include/raft/sparse/selection/knn.hpp
+++ b/cpp/include/raft/sparse/selection/knn.hpp
@@ -16,8 +16,8 @@
 
 #pragma once
 
+#include <raft/distance/distance_type.hpp>
 #include <raft/handle.hpp>
-#include <raft/linalg/distance_type.hpp>
 #include <raft/sparse/selection/detail/knn.cuh>
 
 namespace raft {
diff --git a/cpp/include/raft/sparse/selection/knn_graph.hpp b/cpp/include/raft/sparse/selection/knn_graph.hpp
index 2a3159900c..357a65447c 100644
--- a/cpp/include/raft/sparse/selection/knn_graph.hpp
+++ b/cpp/include/raft/sparse/selection/knn_graph.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/sparse/coo.hpp>
 #include <raft/sparse/selection/detail/knn_graph.cuh>
 
diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index e2df51a62b..ce1385e688 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 
 #include <faiss/gpu/GpuIndex.h>
 #include <faiss/gpu/StandardGpuResources.h>
diff --git a/cpp/include/raft/spatial/knn/ball_cover.hpp b/cpp/include/raft/spatial/knn/ball_cover.hpp
index 4495221a34..23191f9415 100644
--- a/cpp/include/raft/spatial/knn/ball_cover.hpp
+++ b/cpp/include/raft/spatial/knn/ball_cover.hpp
@@ -19,7 +19,7 @@
 #include <cstdint>
 
 #include <thrust/transform.h>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include "ball_cover_common.h"
 #include "detail/ball_cover.cuh"
 #include "detail/ball_cover/common.cuh"
diff --git a/cpp/include/raft/spatial/knn/ball_cover_common.h b/cpp/include/raft/spatial/knn/ball_cover_common.h
index 9ed1d2f726..e1a202107b 100644
--- a/cpp/include/raft/spatial/knn/ball_cover_common.h
+++ b/cpp/include/raft/spatial/knn/ball_cover_common.h
@@ -17,8 +17,8 @@
 #pragma once
 
 #include <cstdint>
+#include <raft/distance/distance_type.hpp>
 #include <raft/handle.hpp>
-#include <raft/linalg/distance_type.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace raft {
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
index 6f223fdb43..b5d5e48231 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
@@ -43,7 +43,7 @@
 
 #include <thrust/iterator/transform_iterator.h>
 
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 
 #include <cuml/neighbors/knn.hpp>
 
diff --git a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h
index 3708523b4f..bf10356bfa 100644
--- a/cpp/include/raft/spatial/knn/detail/common_faiss.h
+++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h
@@ -20,7 +20,7 @@
 #include <raft/cuda_utils.cuh>
 
 #include <faiss/gpu/GpuDistance.h>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 
 namespace raft {
 namespace spatial {
diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
index 50340a284b..8faf76f096 100644
--- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
+++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
@@ -26,8 +26,8 @@
 #include <faiss/gpu/utils/Limits.cuh>
 #include <faiss/gpu/utils/Select.cuh>
 
+#include <raft/distance/distance_type.hpp>
 #include <raft/handle.hpp>
-#include <raft/linalg/distance_type.hpp>
 
 namespace raft {
 namespace spatial {
diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index 54509b4a51..04b2dc3098 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -32,8 +32,8 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <cstdint>
 #include <iostream>
+#include <raft/distance/distance_type.hpp>
 #include <raft/handle.hpp>
-#include <raft/linalg/distance_type.hpp>
 #include <set>
 
 #include "fused_l2_knn.cuh"
diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp
index 5a4672e711..a515ca8507 100644
--- a/cpp/include/raft/spatial/knn/detail/processing.hpp
+++ b/cpp/include/raft/spatial/knn/detail/processing.hpp
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/linalg/matrix_vector_op.hpp>
 #include <raft/linalg/norm.hpp>
 #include <raft/linalg/unary_op.hpp>
diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp
index cbd0486086..56f4022a8c 100644
--- a/cpp/include/raft/spectral/kmeans.hpp
+++ b/cpp/include/raft/spectral/kmeans.hpp
@@ -31,7 +31,7 @@
 #include <raft/cudart_utils.h>
 #include <raft/device_atomics.cuh>
 #include <raft/handle.hpp>
-#include <raft/linalg/cublas_wrappers.hpp>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/spectral/matrix_wrappers.hpp>
 #include <raft/spectral/warn_dbg.hpp>
 
@@ -657,20 +657,21 @@ static int updateCentroids(handle_t const& handle,
   thrust::device_ptr<index_type_t> rows(work_int + d * n);
 
   // Take transpose of observation matrix
-  RAFT_CUBLAS_TRY(cublasgeam(cublas_h,
-                             CUBLAS_OP_T,
-                             CUBLAS_OP_N,
-                             n,
-                             d,
-                             &one,
-                             obs,
-                             d,
-                             &zero,
-                             (value_type_t*)NULL,
-                             n,
-                             thrust::raw_pointer_cast(obs_copy),
-                             n,
-                             stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h,
+                                                   CUBLAS_OP_T,
+                                                   CUBLAS_OP_N,
+                                                   n,
+                                                   d,
+                                                   &one,
+                                                   obs,
+                                                   d,
+                                                   &zero,
+                                                   (value_type_t*)NULL,
+                                                   n,
+                                                   thrust::raw_pointer_cast(obs_copy),
+                                                   n,
+                                                   stream));
 
   // Cluster assigned to each observation matrix entry
   thrust::sequence(thrust_exec_policy, rows, rows + d * n);
@@ -852,7 +853,9 @@ int kmeans(handle_t const& handle,
   }
 
   // Initialize cuBLAS
-  RAFT_CUBLAS_TRY(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(
+    raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // -------------------------------------------------------
   // k-means++ algorithm
diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp
index a47c41564c..d066c68a68 100644
--- a/cpp/include/raft/spectral/lapack.hpp
+++ b/cpp/include/raft/spectral/lapack.hpp
@@ -18,8 +18,8 @@
 #include <cusolverDn.h>
 
 #include <raft/error.hpp>
-#include <raft/linalg/cublas_wrappers.hpp>
-#include <raft/linalg/cusolver_wrappers.hpp>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/detail/cusolver_wrappers.hpp>
 
 // for now; TODO: check if/where this `define` should be;
 //
diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp
index 6f9d383c63..d463b1b590 100644
--- a/cpp/include/raft/spectral/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/matrix_wrappers.hpp
@@ -18,7 +18,7 @@
 #include <raft/cudart_utils.h>
 #include <raft/sparse/cusparse_wrappers.h>
 #include <raft/handle.hpp>
-#include <raft/linalg/cublas_wrappers.hpp>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <thrust/fill.h>
@@ -349,7 +349,8 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
     if (beta == 0) {
       CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream));
     } else if (beta != 1) {
-      RAFT_CUBLAS_TRY(linalg::cublasscal(cublas_h, n, &beta, y, 1, stream));
+      // TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal(cublas_h, n, &beta, y, 1, stream));
     }
 
     // Apply diagonal matrix
@@ -412,7 +413,9 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
     // gamma = d'*x
     //
     // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res);
-    RAFT_CUBLAS_TRY(linalg::cublasdot(cublas_h,
+    // TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(
+      raft::linalg::detail::cublasdot(cublas_h,
                                       n,
                                       laplacian_matrix_t<index_type, value_type>::diagonal_.raw(),
                                       1,
@@ -424,7 +427,9 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
     // y = y -(gamma/edge_sum)*d
     //
     value_type gamma_ = -dot_res / edge_sum_;
-    RAFT_CUBLAS_TRY(linalg::cublasaxpy(cublas_h,
+    // TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(
+      raft::linalg::detail::cublasaxpy(cublas_h,
                                        n,
                                        &gamma_,
                                        laplacian_matrix_t<index_type, value_type>::diagonal_.raw(),
diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp
index c61b5f1458..8188a772b8 100644
--- a/cpp/include/raft/spectral/modularity_maximization.hpp
+++ b/cpp/include/raft/spectral/modularity_maximization.hpp
@@ -160,7 +160,9 @@ void analyzeModularity(handle_t const& handle,
   vector_t<weight_t> Bx(handle, n);
 
   // Initialize cuBLAS
-  RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  // #TODO: Use public API when ready
+  RAFT_CUBLAS_TRY(
+    raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // Initialize Modularity
   modularity_matrix_t<vertex_t, weight_t> B{handle, csr_m};
diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp
index a30906de10..6b57566a73 100644
--- a/cpp/include/raft/spectral/spectral_util.hpp
+++ b/cpp/include/raft/spectral/spectral_util.hpp
@@ -18,6 +18,7 @@
 
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
 
 #include <thrust/fill.h>
 #include <thrust/reduce.h>
@@ -132,7 +133,9 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs,
                       thrust::minus<weight_t>());
     RAFT_CHECK_CUDA(stream);
 
-    RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
+    // TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(
+      raft::linalg::detail::cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
 
     std /= std::sqrt(static_cast<weight_t>(n));
 
@@ -149,22 +152,25 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs,
   //   TODO: in-place transpose
   {
     vector_t<weight_t> work(handle, nEigVecs * n);
-    RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
-
-    RAFT_CUBLAS_TRY(cublasgeam(cublas_h,
-                               CUBLAS_OP_T,
-                               CUBLAS_OP_N,
-                               nEigVecs,
-                               n,
-                               &one,
-                               eigVecs,
-                               n,
-                               &zero,
-                               (weight_t*)NULL,
-                               nEigVecs,
-                               work.raw(),
-                               nEigVecs,
-                               stream));
+    // TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(
+      raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+
+    // TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h,
+                                                     CUBLAS_OP_T,
+                                                     CUBLAS_OP_N,
+                                                     nEigVecs,
+                                                     n,
+                                                     &one,
+                                                     eigVecs,
+                                                     n,
+                                                     &zero,
+                                                     (weight_t*)NULL,
+                                                     nEigVecs,
+                                                     work.raw(),
+                                                     nEigVecs,
+                                                     stream));
 
     RAFT_CUDA_TRY(cudaMemcpyAsync(
       eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream));
@@ -216,14 +222,18 @@ bool construct_indicator(handle_t const& handle,
   RAFT_CHECK_CUDA(stream);
 
   // Compute size of ith partition
-  RAFT_CUBLAS_TRY(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream));
+  // TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(
+    cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream));
 
   clustersize = round(clustersize);
   if (clustersize < 0.5) { return false; }
 
   // Compute part stats
   B.mv(1, part_i.raw(), 0, Bx.raw());
-  RAFT_CUBLAS_TRY(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream));
+  // TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(
+    raft::linalg::detail::cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream));
 
   return true;
 }
diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu
index 85f5d7ada1..6583d8d23c 100644
--- a/cpp/test/linalg/cholesky_r1.cu
+++ b/cpp/test/linalg/cholesky_r1.cu
@@ -18,7 +18,7 @@
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
 #include <raft/linalg/cholesky_r1_update.hpp>
-#include <raft/linalg/cusolver_wrappers.hpp>
+#include <raft/linalg/detail/cusolver_wrappers.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
@@ -42,7 +42,8 @@ class CholeskyR1Test : public ::testing::Test {
 
     // Allocate workspace
     solver_handle = handle.get_cusolver_dn_handle();
-    RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnpotrf_bufferSize(
+    // TODO: Call from public API when ready
+    RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf_bufferSize(
       solver_handle, CUBLAS_FILL_MODE_LOWER, n_rows, L.data(), n_rows, &Lwork));
     int n_bytes = 0;
     // Initializing in CUBLAS_FILL_MODE_LOWER, because that has larger workspace
@@ -72,15 +73,16 @@ class CholeskyR1Test : public ::testing::Test {
 
         // Expected solution using Cholesky factorization from scratch
         raft::copy(L_exp.data(), G.data(), n, handle.get_stream());
-        RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnpotrf(solver_handle,
-                                                        uplo,
-                                                        rank,
-                                                        L_exp.data(),
-                                                        n_rows,
-                                                        (math_t*)workspace.data(),
-                                                        Lwork,
-                                                        devInfo.data(),
-                                                        handle.get_stream()));
+        // TODO: Call from public API when ready
+        RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf(solver_handle,
+                                                                uplo,
+                                                                rank,
+                                                                L_exp.data(),
+                                                                n_rows,
+                                                                (math_t*)workspace.data(),
+                                                                Lwork,
+                                                                devInfo.data(),
+                                                                handle.get_stream()));
 
         // Incremental Cholesky factorization using rank one updates.
         raft::linalg::choleskyRank1Update(
diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh
index aae57e136e..c5728cd8d4 100644
--- a/cpp/test/linalg/reduce.cuh
+++ b/cpp/test/linalg/reduce.cuh
@@ -18,7 +18,7 @@
 
 #include <cublas_v2.h>
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/cublas_wrappers.hpp>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/linalg/unary_op.hpp>
 #include <rmm/device_uvector.hpp>
 
@@ -65,7 +65,8 @@ void unaryAndGemv(OutType* dots, const InType* data, int D, int N, cudaStream_t
   raft::linalg::unaryOp<OutType>(
     ones.data(), ones.data(), ones.size(), [=] __device__(OutType input) { return 1; }, stream);
   OutType alpha = 1, beta = 0;
-  RAFT_CUBLAS_TRY(raft::linalg::cublasgemv(
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(
     handle, CUBLAS_OP_N, D, N, &alpha, sq.data(), D, ones.data(), 1, &beta, dots, 1, stream));
   RAFT_CUDA_TRY(cudaDeviceSynchronize());
   RAFT_CUBLAS_TRY(cublasDestroy(handle));
diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu
index 2c56a902d4..850ecd72c8 100644
--- a/cpp/test/sparse/connect_components.cu
+++ b/cpp/test/sparse/connect_components.cu
@@ -26,7 +26,7 @@
 #include <raft/sparse/mst/mst.cuh>
 #include <raft/sparse/selection/knn_graph.hpp>
 
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/linalg/transpose.hpp>
 #include <raft/sparse/convert/csr.hpp>
 #include <raft/sparse/coo.hpp>
diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu
index eae8fec500..6faedfa137 100644
--- a/cpp/test/sparse/dist_coo_spmv.cu
+++ b/cpp/test/sparse/dist_coo_spmv.cu
@@ -20,7 +20,7 @@
 
 #include <raft/cudart_utils.h>
 #include <raft/sparse/cusparse_wrappers.h>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/linalg/unary_op.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu
index d635c4a813..39b936573f 100644
--- a/cpp/test/sparse/distance.cu
+++ b/cpp/test/sparse/distance.cu
@@ -20,7 +20,7 @@
 
 #include <raft/cudart_utils.h>
 #include <raft/sparse/cusparse_wrappers.h>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 
 #include <raft/sparse/distance/distance.hpp>
 
diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu
index 6e4de63e4d..c5858610da 100644
--- a/cpp/test/sparse/knn.cu
+++ b/cpp/test/sparse/knn.cu
@@ -18,7 +18,7 @@
 #include <gtest/gtest.h>
 
 #include <raft/sparse/cusparse_wrappers.h>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/sparse/selection/knn.hpp>
 #include "../test_utils.h"
 
diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu
index 51947167cf..cb09b9e7f5 100644
--- a/cpp/test/sparse/linkage.cu
+++ b/cpp/test/sparse/linkage.cu
@@ -17,7 +17,7 @@
 #include "../test_utils.h"
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/linalg/transpose.hpp>
 #include <raft/sparse/coo.hpp>
 #include <raft/sparse/hierarchy/single_linkage.hpp>
diff --git a/cpp/test/spatial/ball_cover.cu b/cpp/test/spatial/ball_cover.cu
index d63674c13c..fb5e72141b 100644
--- a/cpp/test/spatial/ball_cover.cu
+++ b/cpp/test/spatial/ball_cover.cu
@@ -15,7 +15,7 @@
  */
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/spatial/knn/ball_cover.hpp>
 #include <raft/spatial/knn/detail/knn_brute_force_faiss.cuh>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu
index 303844b0a4..40c16eed09 100644
--- a/cpp/test/spatial/fused_l2_knn.cu
+++ b/cpp/test/spatial/fused_l2_knn.cu
@@ -20,7 +20,7 @@
 #include <faiss/gpu/StandardGpuResources.h>
 
 #include <raft/spatial/knn/detail/common_faiss.h>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/random/rng.hpp>
 #include <raft/spatial/knn/detail/fused_l2_knn.cuh>
 #include <raft/spatial/knn/knn.hpp>
diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu
index d28fd55dbe..3b720de505 100644
--- a/cpp/test/spatial/haversine.cu
+++ b/cpp/test/spatial/haversine.cu
@@ -16,7 +16,7 @@
 
 #include <gtest/gtest.h>
 #include <iostream>
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/spatial/knn/detail/haversine_distance.cuh>
 #include <rmm/device_uvector.hpp>
 #include <vector>
diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu
index 839d60095e..ecd76e5598 100644
--- a/cpp/test/spatial/knn.cu
+++ b/cpp/test/spatial/knn.cu
@@ -16,7 +16,7 @@
 
 #include "../test_utils.h"
 
-#include <raft/linalg/distance_type.hpp>
+#include <raft/distance/distance_type.hpp>
 #include <raft/spatial/knn/knn.hpp>
 
 #include <rmm/device_buffer.hpp>

From 34b24396c43af190214903427c6d4197dae5d531 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Tue, 21 Dec 2021 19:32:36 -0800
Subject: [PATCH 09/17] correcting doxygen build

---
 cpp/include/raft/linalg/detail/eig.hpp | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp
index a74384e479..c04a939652 100644
--- a/cpp/include/raft/linalg/detail/eig.hpp
+++ b/cpp/include/raft/linalg/detail/eig.hpp
@@ -139,20 +139,6 @@ void eigDC(const raft::handle_t& handle,
 
 enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT };
 
-/**
- * @defgroup eig decomp with divide and conquer method for the column-major
- * symmetric matrices
- * @param handle raft handle
- * @param in the input buffer (symmetric matrix that has real eig values and
- * vectors.
- * @param n_rows: number of rows of the input
- * @param n_cols: number of cols of the input
- * @param n_eig_vals: number of eigenvectors to be generated
- * @param eig_vectors: eigenvectors
- * @param eig_vals: eigen values
- * @param stream cuda stream
- * @{
- */
 template <typename math_t>
 void eigSelDC(const raft::handle_t& handle,
               math_t* in,

From 897e6f7f7745cd63658607941239e1b71527a69e Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 22 Dec 2021 12:15:02 -0800
Subject: [PATCH 10/17] correcting wrong docs

---
 cpp/include/raft/linalg/eig.hpp  | 18 +++++++++-----
 cpp/include/raft/linalg/gemv.hpp | 40 +++++++++++++++++++-------------
 2 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp
index 1680e58cbf..3ddf5419b9 100644
--- a/cpp/include/raft/linalg/eig.hpp
+++ b/cpp/include/raft/linalg/eig.hpp
@@ -22,7 +22,12 @@ namespace raft {
 namespace linalg {
 
 /**
- * @defgroup eig decomp with divide and conquer method for the column-major
+ * @defgroup eig Eigen Decomposition Methods
+ * @{
+ */
+
+/**
+ * @brief eig decomp with divide and conquer method for the column-major
  * symmetric matrices
  * @param handle raft handle
  * @param in the input buffer (symmetric matrix that has real eig values and
@@ -32,7 +37,6 @@ namespace linalg {
  * @param eig_vectors: eigenvectors
  * @param eig_vals: eigen values
  * @param stream cuda stream
- * @{
  */
 template <typename math_t>
 void eigDC(const raft::handle_t& handle,
@@ -51,7 +55,7 @@ using detail::EigVecMemUsage;
 using detail::OVERWRITE_INPUT;
 
 /**
- * @defgroup eig decomp with divide and conquer method for the column-major
+ * @brief selective eig decomp (n_eig_vals eigenvectors) with divide and conquer for column-major
  * symmetric matrices
  * @param handle raft handle
  * @param in the input buffer (symmetric matrix that has real eig values and
@@ -61,8 +65,8 @@ using detail::OVERWRITE_INPUT;
  * @param n_eig_vals: number of eigenvectors to be generated
  * @param eig_vectors: eigenvectors
  * @param eig_vals: eigen values
+ * @param memUsage: eig vector memory usage mode (OVERWRITE_INPUT or COPY_INPUT)
  * @param stream cuda stream
- * @{
  */
 template <typename math_t>
 void eigSelDC(const raft::handle_t& handle,
@@ -79,18 +83,19 @@ void eigSelDC(const raft::handle_t& handle,
 }
 
 /**
- * @defgroup overloaded function for eig decomp with Jacobi method for the
+ * @brief overloaded function for eig decomp with Jacobi method for the
  * column-major symmetric matrices (in parameter)
  * @param handle: raft handle
+ * @param in: input matrix
  * @param n_rows: number of rows of the input
  * @param n_cols: number of cols of the input
  * @param eig_vectors: eigenvectors
  * @param eig_vals: eigen values
+ * @param stream: stream on which this function will be run
  * @param tol: error tolerance for the jacobi method. Algorithm stops when the
  * error is below tol
  * @param sweeps: number of sweeps in the Jacobi algorithm. The more the better
  * accuracy.
- * @{
  */
 template <typename math_t>
 void eigJacobi(const raft::handle_t& handle,
@@ -105,6 +110,7 @@ void eigJacobi(const raft::handle_t& handle,
 {
   detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps);
 }
+/** @} */  // end of eig
 
 };  // end namespace linalg
 };  // end namespace raft
diff --git a/cpp/include/raft/linalg/gemv.hpp b/cpp/include/raft/linalg/gemv.hpp
index 2a123e8895..dabb1f121a 100644
--- a/cpp/include/raft/linalg/gemv.hpp
+++ b/cpp/include/raft/linalg/gemv.hpp
@@ -43,12 +43,17 @@ void gemv(const raft::handle_t& handle,
  *
  * where
  *
+ * @param handle raft handle
  * @param A is a column-major matrix of size n_rows_a * n_cols_a.
  *   op(A) is either the transpose operation (trans_a == true) or identity.
- *
+ * @param n_rows_a number of rows in A
+ * @param n_cols_a number of cols in A
  * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`.
- *
  * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`.
+ * @param trans_a whether to take the transpose of A
+ * @param alpha is a scalar scale of Ax.
+ * @param beta is a scalar scale of y.
+ * @param stream stream on which this function is run
  */
 template <typename math_t>
 void gemv(const raft::handle_t& handle,
@@ -70,12 +75,15 @@ void gemv(const raft::handle_t& handle,
  *
  * where
  *
+ * @param handle raft handle
  * @param A is a column-major matrix of size n_rows_a * n_cols_a.
  *   op(A) is either the transpose operation (trans_a == true) or identity.
- *
+ * @param n_rows_a number of rows in A
+ * @param n_cols_a number of cols in A
  * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`.
- *
  * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`.
+ * @param trans_a whether to take the transpose of A
+ * @param stream stream on which this function is run
  */
 template <typename math_t>
 void gemv(const raft::handle_t& handle,
@@ -94,21 +102,20 @@ void gemv(const raft::handle_t& handle,
  * y = alpha * op(A) * x + beta * y
  *
  * where
- *
- * @param alpha is a scalar scale of Ax.
- *
- * @param beta is a scalar scale of y.
- *
+ * @param handle raft handle
  * @param A is a column-major matrix of size n_rows_a * n_cols_a.
  *   op(A) is either the transpose operation (trans_a == true) or identity.
- *
+ * @param n_rows_a number of rows in A
+ * @param n_cols_a number of cols in A
  * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a.
  *     set it when you need to use only the first n_rows_a rows of the matrix A, which has
  *     (perhaps, due to padding) lda rows.
- *
  * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`.
- *
  * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`.
+ * @param trans_a whether to take the transpose of A
+ * @param alpha is a scalar scale of Ax.
+ * @param beta is a scalar scale of y.
+ * @param stream stream on which this function is run
  */
 template <typename math_t>
 void gemv(const raft::handle_t& handle,
@@ -130,17 +137,18 @@ void gemv(const raft::handle_t& handle,
  * y = op(A) * x
  *
  * where
- *
+ * @param handle raft handle
  * @param A is a column-major matrix of size n_rows_a * n_cols_a.
  *   op(A) is either the transpose operation (trans_a == true) or identity.
- *
+ * @param n_rows_a number of rows in A
+ * @param n_cols_a number of cols in A
  * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a.
  *     set it when you need to use only the first n_rows_a rows of the matrix A, which has
  *     (perhaps, due to padding) lda rows.
- *
  * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`.
- *
  * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`.
+ * @param trans_a whether to take the transpose of A
+ * @param stream stream on which this function is run
  *
  */
 template <typename math_t>

From 3d4b5f1e4d30e44f46f855e6422a1e105d1fbd8f Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Tue, 11 Jan 2022 10:40:41 -0800
Subject: [PATCH 11/17] review feedback

---
 cpp/include/raft/linalg/add.hpp               |   2 +-
 cpp/include/raft/linalg/binary_op.hpp         |   2 +-
 .../raft/linalg/cholesky_r1_update.hpp        |   2 +-
 .../raft/linalg/coalesced_reduction.hpp       |   2 +-
 cpp/include/raft/linalg/contractions.hpp      |   2 +-
 cpp/include/raft/linalg/detail/add.cuh        |   2 +-
 cpp/include/raft/linalg/detail/binary_op.cuh  |   2 +-
 .../raft/linalg/detail/cholesky_r1_update.hpp |   2 +-
 .../linalg/detail/coalesced_reduction.cuh     |   2 +-
 .../raft/linalg/detail/contractions.cuh       |   2 +-
 .../raft/linalg/detail/cublas_wrappers.hpp    |   2 +-
 .../raft/linalg/detail/cusolver_wrappers.hpp  |   2 +-
 cpp/include/raft/linalg/detail/divide.hpp     |   2 +-
 cpp/include/raft/linalg/detail/eig.hpp        |   2 +-
 cpp/include/raft/linalg/detail/eltwise.hpp    |   2 +-
 cpp/include/raft/linalg/detail/functional.cuh |   2 +-
 cpp/include/raft/linalg/detail/gemm.hpp       |   2 +-
 cpp/include/raft/linalg/detail/gemv.hpp       |   2 +-
 cpp/include/raft/linalg/detail/init.hpp       |   2 +-
 cpp/include/raft/linalg/detail/lanczos.hpp    | 161 ++++++++++---
 cpp/include/raft/linalg/detail/map.cuh        |   2 +-
 .../raft/linalg/detail/map_then_reduce.cuh    |   2 +-
 .../raft/linalg/detail/matrix_vector_op.cuh   |   2 +-
 .../raft/linalg/detail/mean_squared_error.hpp |   2 +-
 cpp/include/raft/linalg/detail/multiply.hpp   |   2 +-
 cpp/include/raft/linalg/detail/norm.hpp       |   2 +-
 cpp/include/raft/linalg/detail/qr.cuh         |   2 +-
 cpp/include/raft/linalg/detail/reduce.hpp     |   2 +-
 .../raft/linalg/detail/strided_reduction.cuh  |   2 +-
 cpp/include/raft/linalg/detail/subtract.cuh   |   2 +-
 cpp/include/raft/linalg/detail/svd.hpp        |   2 +-
 cpp/include/raft/linalg/detail/transpose.hpp  |   2 +-
 cpp/include/raft/linalg/detail/unary_op.cuh   |   2 +-
 cpp/include/raft/linalg/divide.hpp            |   2 +-
 cpp/include/raft/linalg/eig.hpp               |   2 +-
 cpp/include/raft/linalg/eltwise.hpp           |   2 +-
 cpp/include/raft/linalg/gemm.hpp              |   2 +-
 cpp/include/raft/linalg/gemv.hpp              |   2 +-
 cpp/include/raft/linalg/init.hpp              |   2 +-
 cpp/include/raft/linalg/lanczos.hpp           | 217 ++----------------
 cpp/include/raft/linalg/map.hpp               |   2 +-
 cpp/include/raft/linalg/map_then_reduce.hpp   |   2 +-
 cpp/include/raft/linalg/matrix_vector_op.hpp  |   2 +-
 .../raft/linalg/mean_squared_error.hpp        |   2 +-
 cpp/include/raft/linalg/multiply.hpp          |   2 +-
 cpp/include/raft/linalg/norm.hpp              |   2 +-
 cpp/include/raft/linalg/qr.hpp                |   2 +-
 cpp/include/raft/linalg/reduce.hpp            |   2 +-
 cpp/include/raft/linalg/strided_reduction.hpp |   2 +-
 cpp/include/raft/linalg/subtract.hpp          |   2 +-
 cpp/include/raft/linalg/svd.hpp               |   2 +-
 cpp/include/raft/linalg/transpose.hpp         |   2 +-
 cpp/include/raft/linalg/unary_op.hpp          |   2 +-
 53 files changed, 201 insertions(+), 279 deletions(-)

diff --git a/cpp/include/raft/linalg/add.hpp b/cpp/include/raft/linalg/add.hpp
index 08496eef0d..2f999a45d2 100644
--- a/cpp/include/raft/linalg/add.hpp
+++ b/cpp/include/raft/linalg/add.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/binary_op.hpp b/cpp/include/raft/linalg/binary_op.hpp
index 12afcbcd9a..5c73b6d3c5 100644
--- a/cpp/include/raft/linalg/binary_op.hpp
+++ b/cpp/include/raft/linalg/binary_op.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/cholesky_r1_update.hpp b/cpp/include/raft/linalg/cholesky_r1_update.hpp
index 9dc9630a86..583c65c50e 100644
--- a/cpp/include/raft/linalg/cholesky_r1_update.hpp
+++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp
index 00ac7b4be9..0f1ca9202d 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.hpp
+++ b/cpp/include/raft/linalg/coalesced_reduction.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/contractions.hpp b/cpp/include/raft/linalg/contractions.hpp
index ae6832bd7a..e317588b1d 100644
--- a/cpp/include/raft/linalg/contractions.hpp
+++ b/cpp/include/raft/linalg/contractions.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh
index 7924f11e90..794a776dcf 100644
--- a/cpp/include/raft/linalg/detail/add.cuh
+++ b/cpp/include/raft/linalg/detail/add.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh
index 7c9ba2aeed..6b1f8bc6d7 100644
--- a/cpp/include/raft/linalg/detail/binary_op.cuh
+++ b/cpp/include/raft/linalg/detail/binary_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
index 5f88c36a0c..d070e47b31 100644
--- a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
+++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
index bb451bf13a..7e545e4932 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh
index d5dd416c49..40d0839f60 100644
--- a/cpp/include/raft/linalg/detail/contractions.cuh
+++ b/cpp/include/raft/linalg/detail/contractions.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
index 5c8779b0cf..83890f348a 100644
--- a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
+++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
index 2ff6825ea9..171227498d 100644
--- a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
+++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/divide.hpp b/cpp/include/raft/linalg/detail/divide.hpp
index ad579a31f0..c694529fb5 100644
--- a/cpp/include/raft/linalg/detail/divide.hpp
+++ b/cpp/include/raft/linalg/detail/divide.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp
index c04a939652..a27a65efd8 100644
--- a/cpp/include/raft/linalg/detail/eig.hpp
+++ b/cpp/include/raft/linalg/detail/eig.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/eltwise.hpp b/cpp/include/raft/linalg/detail/eltwise.hpp
index e60c97e0e6..b15717f205 100644
--- a/cpp/include/raft/linalg/detail/eltwise.hpp
+++ b/cpp/include/raft/linalg/detail/eltwise.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/functional.cuh b/cpp/include/raft/linalg/detail/functional.cuh
index 4cebd34d08..067b1565e0 100644
--- a/cpp/include/raft/linalg/detail/functional.cuh
+++ b/cpp/include/raft/linalg/detail/functional.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp
index ca01f20d8b..28f12084f7 100644
--- a/cpp/include/raft/linalg/detail/gemm.hpp
+++ b/cpp/include/raft/linalg/detail/gemm.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/gemv.hpp b/cpp/include/raft/linalg/detail/gemv.hpp
index 246db6e58b..991268cf26 100644
--- a/cpp/include/raft/linalg/detail/gemv.hpp
+++ b/cpp/include/raft/linalg/detail/gemv.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/init.hpp b/cpp/include/raft/linalg/detail/init.hpp
index 5aa8d8247e..4718a2cb0e 100644
--- a/cpp/include/raft/linalg/detail/init.hpp
+++ b/cpp/include/raft/linalg/detail/init.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp
index c585e23a20..9ee2f6cdc9 100644
--- a/cpp/include/raft/linalg/detail/lanczos.hpp
+++ b/cpp/include/raft/linalg/detail/lanczos.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -700,6 +700,55 @@ static int lanczosRestart(handle_t const& handle,
 
 namespace detail {
 
+/**
+ * @brief  Compute smallest eigenvectors of symmetric matrix
+ *    Computes eigenvalues and eigenvectors that are least
+ *    positive. If matrix is positive definite or positive
+ *    semidefinite, the computed eigenvalues are smallest in
+ *    magnitude.
+ *    The largest eigenvalue is estimated by performing several
+ *    Lanczos iterations. An implicitly restarted Lanczos method is
+ *    then applied to A+s*I, where s is the negative of the
+ *    largest eigenvalue.
+ *  @tparam index_type_t the type of data used for indexing.
+ *  @tparam value_type_t the type of data used for weights, distances.
+ *  @param handle the raft handle.
+ *  @param A Matrix.
+ *  @param nEigVecs Number of eigenvectors to compute.
+ *  @param maxIter Maximum number of Lanczos steps. Does not include
+ *    Lanczos steps used to estimate largest eigenvalue.
+ *  @param restartIter Maximum size of Lanczos system before
+ *    performing an implicit restart. Should be at least 4.
+ *  @param tol Convergence tolerance. Lanczos iteration will
+ *    terminate when the residual norm is less than tol*theta, where
+ *    theta is an estimate for the smallest unwanted eigenvalue
+ *    (i.e. the (nEigVecs+1)th smallest eigenvalue).
+ *  @param reorthogonalize Whether to reorthogonalize Lanczos
+ *    vectors.
+ *  @param effIter On exit, pointer to final size of Lanczos system.
+ *  @param totalIter On exit, pointer to total number of Lanczos
+ *    iterations performed. Does not include Lanczos steps used to
+ *    estimate largest eigenvalue.
+ *  @param shift On exit, pointer to matrix shift (estimate for
+ *    largest eigenvalue).
+ *  @param alpha_host (Output, host memory, restartIter entries)
+ *    Diagonal entries of Lanczos system.
+ *  @param beta_host (Output, host memory, restartIter entries)
+ *    Off-diagonal entries of Lanczos system.
+ *  @param lanczosVecs_dev (Output, device memory, n*(restartIter+1)
+ *    entries) Lanczos vectors. Vectors are stored as columns of a
+ *    column-major matrix with dimensions n x (restartIter+1).
+ *  @param work_dev (Output, device memory,
+ *    (n+restartIter)*restartIter entries) Workspace.
+ *  @param eigVals_dev (Output, device memory, nEigVecs entries)
+ *    Smallest eigenvalues of matrix.
+ *  @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
+ *    Eigenvectors corresponding to smallest eigenvalues of
+ *    matrix. Vectors are stored as columns of a column-major matrix
+ *    with dimensions n x nEigVecs.
+ *  @param seed random seed.
+ *  @return error flag.
+ */
 template <typename index_type_t, typename value_type_t>
 int computeSmallestEigenvectors(handle_t const& handle,
                                 sparse_matrix_t<index_type_t, value_type_t> const* A,
@@ -980,28 +1029,72 @@ int computeSmallestEigenvectors(handle_t const& handle,
   // Perform Lanczos method
   index_type_t effIter;
   value_type_t shift;
-  int status = raft::detail::computeSmallestEigenvectors(handle,
-                                                         &A,
-                                                         nEigVecs,
-                                                         maxIter,
-                                                         restartIter,
-                                                         tol,
-                                                         reorthogonalize,
-                                                         &effIter,
-                                                         &iter,
-                                                         &shift,
-                                                         alpha_host,
-                                                         beta_host,
-                                                         lanczosVecs_dev.raw(),
-                                                         work_dev.raw(),
-                                                         eigVals_dev,
-                                                         eigVecs_dev,
-                                                         seed);
+  int status = computeSmallestEigenvectors(handle,
+                                           &A,
+                                           nEigVecs,
+                                           maxIter,
+                                           restartIter,
+                                           tol,
+                                           reorthogonalize,
+                                           &effIter,
+                                           &iter,
+                                           &shift,
+                                           alpha_host,
+                                           beta_host,
+                                           lanczosVecs_dev.raw(),
+                                           work_dev.raw(),
+                                           eigVals_dev,
+                                           eigVecs_dev,
+                                           seed);
 
   // Clean up and return
   return status;
 }
 
+/**
+ *  @brief Compute largest eigenvectors of symmetric matrix
+ *    Computes eigenvalues and eigenvectors that are most
+ *    positive. If matrix is positive definite or positive
+ *    semidefinite, the computed eigenvalues are largest in
+ *    magnitude.
+ *    The largest eigenvalue is estimated by performing several
+ *    Lanczos iterations. An implicitly restarted Lanczos method is
+ *    then applied.
+ *  @tparam index_type_t the type of data used for indexing.
+ *  @tparam value_type_t the type of data used for weights, distances.
+ *  @param handle the raft handle.
+ *  @param A Matrix.
+ *  @param nEigVecs Number of eigenvectors to compute.
+ *  @param maxIter Maximum number of Lanczos steps.
+ *  @param restartIter Maximum size of Lanczos system before
+ *    performing an implicit restart. Should be at least 4.
+ *  @param tol Convergence tolerance. Lanczos iteration will
+ *    terminate when the residual norm is less than tol*theta, where
+ *    theta is an estimate for the largest unwanted eigenvalue
+ *    (i.e. the (nEigVecs+1)th largest eigenvalue).
+ *  @param reorthogonalize Whether to reorthogonalize Lanczos
+ *    vectors.
+ *  @param effIter On exit, pointer to final size of Lanczos system.
+ *  @param totalIter On exit, pointer to total number of Lanczos
+ *    iterations performed.
+ *  @param alpha_host (Output, host memory, restartIter entries)
+ *    Diagonal entries of Lanczos system.
+ *  @param beta_host (Output, host memory, restartIter entries)
+ *    Off-diagonal entries of Lanczos system.
+ *  @param lanczosVecs_dev (Output, device memory, n*(restartIter+1)
+ *    entries) Lanczos vectors. Vectors are stored as columns of a
+ *    column-major matrix with dimensions n x (restartIter+1).
+ *  @param work_dev (Output, device memory,
+ *    (n+restartIter)*restartIter entries) Workspace.
+ *  @param eigVals_dev (Output, device memory, nEigVecs entries)
+ *    Largest eigenvalues of matrix.
+ *  @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
+ *    Eigenvectors corresponding to largest eigenvalues of
+ *    matrix. Vectors are stored as columns of a column-major matrix
+ *    with dimensions n x nEigVecs.
+ *  @param seed random seed.
+ *  @return error flag.
+ */
 template <typename index_type_t, typename value_type_t>
 int computeLargestEigenvectors(handle_t const& handle,
                                sparse_matrix_t<index_type_t, value_type_t> const* A,
@@ -1282,22 +1375,22 @@ int computeLargestEigenvectors(handle_t const& handle,
 
   // Perform Lanczos method
   index_type_t effIter;
-  int status = raft::detail::computeLargestEigenvectors(handle,
-                                                        &A,
-                                                        nEigVecs,
-                                                        maxIter,
-                                                        restartIter,
-                                                        tol,
-                                                        reorthogonalize,
-                                                        &effIter,
-                                                        &iter,
-                                                        alpha_host,
-                                                        beta_host,
-                                                        lanczosVecs_dev.raw(),
-                                                        work_dev.raw(),
-                                                        eigVals_dev,
-                                                        eigVecs_dev,
-                                                        seed);
+  int status = computeLargestEigenvectors(handle,
+                                          &A,
+                                          nEigVecs,
+                                          maxIter,
+                                          restartIter,
+                                          tol,
+                                          reorthogonalize,
+                                          &effIter,
+                                          &iter,
+                                          alpha_host,
+                                          beta_host,
+                                          lanczosVecs_dev.raw(),
+                                          work_dev.raw(),
+                                          eigVals_dev,
+                                          eigVecs_dev,
+                                          seed);
 
   // Clean up and return
   return status;
diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh
index 7f1ba3da0d..513432ef27 100644
--- a/cpp/include/raft/linalg/detail/map.cuh
+++ b/cpp/include/raft/linalg/detail/map.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh
index 089bc627be..99e04d82e7 100644
--- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh
+++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh
index 972bd793ab..e7debb0bee 100644
--- a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/mean_squared_error.hpp b/cpp/include/raft/linalg/detail/mean_squared_error.hpp
index 2ef9479b87..f0a9daebdb 100644
--- a/cpp/include/raft/linalg/detail/mean_squared_error.hpp
+++ b/cpp/include/raft/linalg/detail/mean_squared_error.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/multiply.hpp b/cpp/include/raft/linalg/detail/multiply.hpp
index 2cd83920c5..da06c23aed 100644
--- a/cpp/include/raft/linalg/detail/multiply.hpp
+++ b/cpp/include/raft/linalg/detail/multiply.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/norm.hpp b/cpp/include/raft/linalg/detail/norm.hpp
index 492f34e59d..fcf98c7daf 100644
--- a/cpp/include/raft/linalg/detail/norm.hpp
+++ b/cpp/include/raft/linalg/detail/norm.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh
index 2abb61db59..a250dd3578 100644
--- a/cpp/include/raft/linalg/detail/qr.cuh
+++ b/cpp/include/raft/linalg/detail/qr.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/reduce.hpp b/cpp/include/raft/linalg/detail/reduce.hpp
index 181a7d52b1..94c8f5ba52 100644
--- a/cpp/include/raft/linalg/detail/reduce.hpp
+++ b/cpp/include/raft/linalg/detail/reduce.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh
index f9313088f9..a0d1e2abaa 100644
--- a/cpp/include/raft/linalg/detail/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh
index 26fe258825..767373574b 100644
--- a/cpp/include/raft/linalg/detail/subtract.cuh
+++ b/cpp/include/raft/linalg/detail/subtract.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/svd.hpp b/cpp/include/raft/linalg/detail/svd.hpp
index df61d20274..796adc89ff 100644
--- a/cpp/include/raft/linalg/detail/svd.hpp
+++ b/cpp/include/raft/linalg/detail/svd.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/transpose.hpp b/cpp/include/raft/linalg/detail/transpose.hpp
index 9dda6e5991..659d3a8ef6 100644
--- a/cpp/include/raft/linalg/detail/transpose.hpp
+++ b/cpp/include/raft/linalg/detail/transpose.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh
index d419a9ed7b..9ddfe79657 100644
--- a/cpp/include/raft/linalg/detail/unary_op.cuh
+++ b/cpp/include/raft/linalg/detail/unary_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/divide.hpp b/cpp/include/raft/linalg/divide.hpp
index a93ffe64fc..6c8480bf19 100644
--- a/cpp/include/raft/linalg/divide.hpp
+++ b/cpp/include/raft/linalg/divide.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp
index 3ddf5419b9..5c465a3a41 100644
--- a/cpp/include/raft/linalg/eig.hpp
+++ b/cpp/include/raft/linalg/eig.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/eltwise.hpp b/cpp/include/raft/linalg/eltwise.hpp
index 930a125be7..5c2a97b57d 100644
--- a/cpp/include/raft/linalg/eltwise.hpp
+++ b/cpp/include/raft/linalg/eltwise.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp
index 19f79b2259..04ddbb3561 100644
--- a/cpp/include/raft/linalg/gemm.hpp
+++ b/cpp/include/raft/linalg/gemm.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/gemv.hpp b/cpp/include/raft/linalg/gemv.hpp
index dabb1f121a..e8d378c187 100644
--- a/cpp/include/raft/linalg/gemv.hpp
+++ b/cpp/include/raft/linalg/gemv.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/init.hpp b/cpp/include/raft/linalg/init.hpp
index 10498363e7..bb577672e8 100644
--- a/cpp/include/raft/linalg/init.hpp
+++ b/cpp/include/raft/linalg/init.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp
index 43164b676a..e7d965f810 100644
--- a/cpp/include/raft/linalg/lanczos.hpp
+++ b/cpp/include/raft/linalg/lanczos.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,93 +24,6 @@ namespace raft {
 // Eigensolver
 // =========================================================
 
-/**
- * @brief  Compute smallest eigenvectors of symmetric matrix
- *    Computes eigenvalues and eigenvectors that are least
- *    positive. If matrix is positive definite or positive
- *    semidefinite, the computed eigenvalues are smallest in
- *    magnitude.
- *    The largest eigenvalue is estimated by performing several
- *    Lanczos iterations. An implicitly restarted Lanczos method is
- *    then applied to A+s*I, where s is negative the largest
- *    eigenvalue.
- *  @tparam index_type_t the type of data used for indexing.
- *  @tparam value_type_t the type of data used for weights, distances.
- *  @param handle the raft handle.
- *  @param A Matrix.
- *  @param nEigVecs Number of eigenvectors to compute.
- *  @param maxIter Maximum number of Lanczos steps. Does not include
- *    Lanczos steps used to estimate largest eigenvalue.
- *  @param restartIter Maximum size of Lanczos system before
- *    performing an implicit restart. Should be at least 4.
- *  @param tol Convergence tolerance. Lanczos iteration will
- *    terminate when the residual norm is less than tol*theta, where
- *    theta is an estimate for the smallest unwanted eigenvalue
- *    (i.e. the (nEigVecs+1)th smallest eigenvalue).
- *  @param reorthogonalize Whether to reorthogonalize Lanczos
- *    vectors.
- *  @param effIter On exit, pointer to final size of Lanczos system.
- *  @param totalIter On exit, pointer to total number of Lanczos
- *    iterations performed. Does not include Lanczos steps used to
- *    estimate largest eigenvalue.
- *  @param shift On exit, pointer to matrix shift (estimate for
- *    largest eigenvalue).
- *  @param alpha_host (Output, host memory, restartIter entries)
- *    Diagonal entries of Lanczos system.
- *  @param beta_host (Output, host memory, restartIter entries)
- *    Off-diagonal entries of Lanczos system.
- *  @param lanczosVecs_dev (Output, device memory, n*(restartIter+1)
- *    entries) Lanczos vectors. Vectors are stored as columns of a
- *    column-major matrix with dimensions n x (restartIter+1).
- *  @param work_dev (Output, device memory,
- *    (n+restartIter)*restartIter entries) Workspace.
- *  @param eigVals_dev (Output, device memory, nEigVecs entries)
- *    Largest eigenvalues of matrix.
- *  @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
- *    Eigenvectors corresponding to smallest eigenvalues of
- *    matrix. Vectors are stored as columns of a column-major matrix
- *    with dimensions n x nEigVecs.
- *  @param seed random seed.
- *  @return error flag.
- */
-template <typename index_type_t, typename value_type_t>
-int computeSmallestEigenvectors(handle_t const& handle,
-                                sparse_matrix_t<index_type_t, value_type_t> const* A,
-                                index_type_t nEigVecs,
-                                index_type_t maxIter,
-                                index_type_t restartIter,
-                                value_type_t tol,
-                                bool reorthogonalize,
-                                index_type_t* effIter,
-                                index_type_t* totalIter,
-                                value_type_t* shift,
-                                value_type_t* __restrict__ alpha_host,
-                                value_type_t* __restrict__ beta_host,
-                                value_type_t* __restrict__ lanczosVecs_dev,
-                                value_type_t* __restrict__ work_dev,
-                                value_type_t* __restrict__ eigVals_dev,
-                                value_type_t* __restrict__ eigVecs_dev,
-                                unsigned long long seed)
-{
-  return raft::detail::computeSmallestEigenvectors(handle,
-                                                   A,
-                                                   nEigVecs,
-                                                   maxIter,
-                                                   restartIter,
-                                                   tol,
-                                                   reorthogonalize,
-                                                   effIter,
-                                                   totalIter,
-                                                   shift,
-                                                   alpha_host,
-                                                   beta_host,
-                                                   lanczosVecs_dev,
-                                                   work_dev,
-                                                   eigVals_dev,
-                                                   eigVecs_dev,
-                                                   seed);
-}
-
 /**
  *  @brief  Compute smallest eigenvectors of symmetric matrix
  *    Computes eigenvalues and eigenvectors that are least
@@ -161,101 +74,17 @@ int computeSmallestEigenvectors(handle_t const& handle,
                                 value_type_t* __restrict__ eigVecs_dev,
                                 unsigned long long seed = 1234567)
 {
-  return raft::detail::computeSmallestEigenvectors(handle,
-                                                   A,
-                                                   nEigVecs,
-                                                   maxIter,
-                                                   restartIter,
-                                                   tol,
-                                                   reorthogonalize,
-                                                   iter,
-                                                   eigVals_dev,
-                                                   eigVecs_dev,
-                                                   seed);
-}
-
-// =========================================================
-// Eigensolver
-// =========================================================
-
-/**
- *  @brief Compute largest eigenvectors of symmetric matrix
- *    Computes eigenvalues and eigenvectors that are least
- *    positive. If matrix is positive definite or positive
- *    semidefinite, the computed eigenvalues are largest in
- *    magnitude.
- *    The largest eigenvalue is estimated by performing several
- *    Lanczos iterations. An implicitly restarted Lanczos method is
- *    then applied.
- *  @tparam index_type_t the type of data used for indexing.
- *  @tparam value_type_t the type of data used for weights, distances.
- *  @param handle the raft handle.
- *  @param A Matrix.
- *  @param nEigVecs Number of eigenvectors to compute.
- *  @param maxIter Maximum number of Lanczos steps.
- *  @param restartIter Maximum size of Lanczos system before
- *    performing an implicit restart. Should be at least 4.
- *  @param tol Convergence tolerance. Lanczos iteration will
- *    terminate when the residual norm is less than tol*theta, where
- *    theta is an estimate for the largest unwanted eigenvalue
- *    (i.e. the (nEigVecs+1)th largest eigenvalue).
- *  @param reorthogonalize Whether to reorthogonalize Lanczos
- *    vectors.
- *  @param effIter On exit, pointer to final size of Lanczos system.
- *  @param totalIter On exit, pointer to total number of Lanczos
- *    iterations performed.
- *  @param alpha_host (Output, host memory, restartIter entries)
- *    Diagonal entries of Lanczos system.
- *  @param beta_host (Output, host memory, restartIter entries)
- *    Off-diagonal entries of Lanczos system.
- *  @param lanczosVecs_dev (Output, device memory, n*(restartIter+1)
- *    entries) Lanczos vectors. Vectors are stored as columns of a
- *    column-major matrix with dimensions n x (restartIter+1).
- *  @param work_dev (Output, device memory,
- *    (n+restartIter)*restartIter entries) Workspace.
- *  @param eigVals_dev (Output, device memory, nEigVecs entries)
- *    Largest eigenvalues of matrix.
- *  @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
- *    Eigenvectors corresponding to largest eigenvalues of
- *    matrix. Vectors are stored as columns of a column-major matrix
- *    with dimensions n x nEigVecs.
- *  @param seed random seed.
- *  @return error flag.
- */
-template <typename index_type_t, typename value_type_t>
-int computeLargestEigenvectors(handle_t const& handle,
-                               sparse_matrix_t<index_type_t, value_type_t> const* A,
-                               index_type_t nEigVecs,
-                               index_type_t maxIter,
-                               index_type_t restartIter,
-                               value_type_t tol,
-                               bool reorthogonalize,
-                               index_type_t* effIter,
-                               index_type_t* totalIter,
-                               value_type_t* __restrict__ alpha_host,
-                               value_type_t* __restrict__ beta_host,
-                               value_type_t* __restrict__ lanczosVecs_dev,
-                               value_type_t* __restrict__ work_dev,
-                               value_type_t* __restrict__ eigVals_dev,
-                               value_type_t* __restrict__ eigVecs_dev,
-                               unsigned long long seed)
-{
-  return raft::detail::computeLargestEigenvectors(handle,
-                                                  A,
-                                                  nEigVecs,
-                                                  maxIter,
-                                                  restartIter,
-                                                  tol,
-                                                  reorthogonalize,
-                                                  effIter,
-                                                  totalIter,
-                                                  alpha_host,
-                                                  beta_host,
-                                                  lanczosVecs_dev,
-                                                  work_dev,
-                                                  eigVals_dev,
-                                                  eigVecs_dev,
-                                                  seed);
+  return detail::computeSmallestEigenvectors(handle,
+                                             A,
+                                             nEigVecs,
+                                             maxIter,
+                                             restartIter,
+                                             tol,
+                                             reorthogonalize,
+                                             iter,
+                                             eigVals_dev,
+                                             eigVecs_dev,
+                                             seed);
 }
 
 /**
@@ -308,17 +137,17 @@ int computeLargestEigenvectors(handle_t const& handle,
                                value_type_t* __restrict__ eigVecs_dev,
                                unsigned long long seed = 123456)
 {
-  return raft::detail::computeLargestEigenvectors(handle,
-                                                  A,
-                                                  nEigVecs,
-                                                  maxIter,
-                                                  restartIter,
-                                                  tol,
-                                                  reorthogonalize,
-                                                  iter,
-                                                  eigVals_dev,
-                                                  eigVecs_dev,
-                                                  seed);
+  return detail::computeLargestEigenvectors(handle,
+                                            A,
+                                            nEigVecs,
+                                            maxIter,
+                                            restartIter,
+                                            tol,
+                                            reorthogonalize,
+                                            iter,
+                                            eigVals_dev,
+                                            eigVecs_dev,
+                                            seed);
 }
 
 }  // namespace raft
diff --git a/cpp/include/raft/linalg/map.hpp b/cpp/include/raft/linalg/map.hpp
index 1c4b6816ae..febeaa8621 100644
--- a/cpp/include/raft/linalg/map.hpp
+++ b/cpp/include/raft/linalg/map.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp
index 48c0318798..04275995a0 100644
--- a/cpp/include/raft/linalg/map_then_reduce.hpp
+++ b/cpp/include/raft/linalg/map_then_reduce.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/matrix_vector_op.hpp b/cpp/include/raft/linalg/matrix_vector_op.hpp
index a8a805b4c2..b9790ebce2 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.hpp
+++ b/cpp/include/raft/linalg/matrix_vector_op.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/mean_squared_error.hpp b/cpp/include/raft/linalg/mean_squared_error.hpp
index 3a97a4396e..42af8642b6 100644
--- a/cpp/include/raft/linalg/mean_squared_error.hpp
+++ b/cpp/include/raft/linalg/mean_squared_error.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/multiply.hpp b/cpp/include/raft/linalg/multiply.hpp
index edc84f2bcf..4a1628b44a 100644
--- a/cpp/include/raft/linalg/multiply.hpp
+++ b/cpp/include/raft/linalg/multiply.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/norm.hpp b/cpp/include/raft/linalg/norm.hpp
index 79b060454b..a6336769ca 100644
--- a/cpp/include/raft/linalg/norm.hpp
+++ b/cpp/include/raft/linalg/norm.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/qr.hpp b/cpp/include/raft/linalg/qr.hpp
index fb1c6be8be..50e97e4069 100644
--- a/cpp/include/raft/linalg/qr.hpp
+++ b/cpp/include/raft/linalg/qr.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/reduce.hpp b/cpp/include/raft/linalg/reduce.hpp
index ca9ad34dc8..1c4ef70df8 100644
--- a/cpp/include/raft/linalg/reduce.hpp
+++ b/cpp/include/raft/linalg/reduce.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp
index f8c37f07e0..0f97323e5a 100644
--- a/cpp/include/raft/linalg/strided_reduction.hpp
+++ b/cpp/include/raft/linalg/strided_reduction.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/subtract.hpp b/cpp/include/raft/linalg/subtract.hpp
index 716db1a195..9d48948cad 100644
--- a/cpp/include/raft/linalg/subtract.hpp
+++ b/cpp/include/raft/linalg/subtract.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/svd.hpp b/cpp/include/raft/linalg/svd.hpp
index c18c73eaed..a30180b174 100644
--- a/cpp/include/raft/linalg/svd.hpp
+++ b/cpp/include/raft/linalg/svd.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/transpose.hpp b/cpp/include/raft/linalg/transpose.hpp
index 4c16df331c..50608877fa 100644
--- a/cpp/include/raft/linalg/transpose.hpp
+++ b/cpp/include/raft/linalg/transpose.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/unary_op.hpp b/cpp/include/raft/linalg/unary_op.hpp
index a7753ccff7..51faa2e4a4 100644
--- a/cpp/include/raft/linalg/unary_op.hpp
+++ b/cpp/include/raft/linalg/unary_op.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From b6471d665d62787975e9872feaef3e7e978957a3 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 26 Jan 2022 15:06:22 -0800
Subject: [PATCH 12/17] review changes

---
 cpp/include/raft/distance/distance_type.hpp   |   2 +-
 .../raft/linalg/detail/cusolver_wrappers.hpp  |  30 --
 cpp/include/raft/linalg/detail/eig.hpp        |   2 -
 cpp/include/raft/linalg/detail/lanczos.hpp    | 403 +++++++++---------
 4 files changed, 203 insertions(+), 234 deletions(-)

diff --git a/cpp/include/raft/distance/distance_type.hpp b/cpp/include/raft/distance/distance_type.hpp
index 7a15c97f48..f75263b00d 100644
--- a/cpp/include/raft/distance/distance_type.hpp
+++ b/cpp/include/raft/distance/distance_type.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
index 171227498d..aac58547f8 100644
--- a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
+++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
@@ -143,7 +143,6 @@ inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
 }
 
@@ -158,7 +157,6 @@ inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
 }
 
@@ -225,7 +223,6 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
 }
 
@@ -242,7 +239,6 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
 }
 /** @} */
@@ -324,7 +320,6 @@ inline cusolverStatus_t cusolverDnsyevj(  // NOLINT
   syevjInfo_t params,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
 }
 
@@ -343,7 +338,6 @@ inline cusolverStatus_t cusolverDnsyevj(  // NOLINT
   syevjInfo_t params,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
 }
 
@@ -420,7 +414,6 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo);
 }
 
@@ -437,7 +430,6 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo);
 }
 /** @} */
@@ -545,7 +537,6 @@ inline cusolverStatus_t cusolverDnsyevdx(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSsyevdx(
     handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo);
 }
@@ -570,7 +561,6 @@ inline cusolverStatus_t cusolverDnsyevdx(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDsyevdx(
     handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo);
 }
@@ -632,7 +622,6 @@ inline cusolverStatus_t cusolverDngesvd(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSgesvd(
     handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo);
 }
@@ -656,7 +645,6 @@ inline cusolverStatus_t cusolverDngesvd(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDgesvd(
     handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo);
 }
@@ -756,7 +744,6 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
   gesvdjInfo_t params,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSgesvdj(
     handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params);
 }
@@ -780,7 +767,6 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
   gesvdjInfo_t params,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDgesvdj(
     handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params);
 }
@@ -845,7 +831,6 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
 }
 
@@ -860,7 +845,6 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
 }
 /** @} */
@@ -893,7 +877,6 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
 }
 
@@ -909,7 +892,6 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
 }
 /** @} */
@@ -941,7 +923,6 @@ inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
 }
 template <>
@@ -956,7 +937,6 @@ inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
 }
 
@@ -1023,7 +1003,6 @@ inline cusolverStatus_t cusolverDnorgqr(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo);
 }
 template <>
@@ -1040,7 +1019,6 @@ inline cusolverStatus_t cusolverDnorgqr(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo);
 }
 
@@ -1121,7 +1099,6 @@ inline cusolverStatus_t cusolverDnormqr(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
 }
 
@@ -1143,7 +1120,6 @@ inline cusolverStatus_t cusolverDnormqr(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
 }
 
@@ -1310,7 +1286,6 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
   void* pBuffer,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverSpSetStream(handle, stream));
   return cusolverSpScsrqrsvBatched(
     handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer);
 }
@@ -1332,7 +1307,6 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
   void* pBuffer,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverSpSetStream(handle, stream));
   return cusolverSpDcsrqrsvBatched(
     handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer);
 }
@@ -1371,7 +1345,6 @@ inline cusolverStatus_t cusolverDnxsyevd_bufferSize(  // NOLINT
   size_t* workspaceInBytesOnHost,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnXsyevd_bufferSize(handle,
                                      params,
                                      jobz,
@@ -1401,7 +1374,6 @@ inline cusolverStatus_t cusolverDnxsyevd_bufferSize(  // NOLINT
   size_t* workspaceInBytesOnHost,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnXsyevd_bufferSize(handle,
                                      params,
                                      jobz,
@@ -1451,7 +1423,6 @@ inline cusolverStatus_t cusolverDnxsyevd(  // NOLINT
   int* info,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnXsyevd(handle,
                           params,
                           jobz,
@@ -1487,7 +1458,6 @@ inline cusolverStatus_t cusolverDnxsyevd(  // NOLINT
   int* info,
   cudaStream_t stream)
 {
-  CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnXsyevd(handle,
                           params,
                           jobz,
diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.hpp
index a27a65efd8..8716b4de29 100644
--- a/cpp/include/raft/linalg/detail/eig.hpp
+++ b/cpp/include/raft/linalg/detail/eig.hpp
@@ -16,8 +16,6 @@
 
 #pragma once
 
-#include "cusolver_wrappers.hpp"
-
 #include "cusolver_wrappers.hpp"
 #include <cuda_runtime_api.h>
 #include <raft/cuda_utils.cuh>
diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp
index 9ee2f6cdc9..c761c06c14 100644
--- a/cpp/include/raft/linalg/detail/lanczos.hpp
+++ b/cpp/include/raft/linalg/detail/lanczos.hpp
@@ -123,28 +123,28 @@ int performLanczosIteration(handle_t const& handle,
 
     // Apply matrix
     if (shift != 0)
-      CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n,
-                               lanczosVecs_dev,
-                               n * sizeof(value_type_t),
-                               cudaMemcpyDeviceToDevice,
-                               stream));
+      RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n,
+                                    lanczosVecs_dev,
+                                    n * sizeof(value_type_t),
+                                    cudaMemcpyDeviceToDevice,
+                                    stream));
     A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n);
 
     // Orthogonalize Lanczos vector
-    CUBLAS_CHECK(cublasdot(
+    RAFT_CUBLAS_TRY(cublasdot(
       cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream));
 
     alpha = -alpha_host[0];
-    CUBLAS_CHECK(cublasaxpy(
+    RAFT_CUBLAS_TRY(cublasaxpy(
       cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream));
-    CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream));
+    RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream));
 
     // Check if Lanczos has converged
     if (beta_host[0] <= tol) return 0;
 
     // Normalize Lanczos vector
     alpha = 1 / beta_host[0];
-    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream));
+    RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream));
   }
 
   // -------------------------------------------------------
@@ -156,113 +156,113 @@ int performLanczosIteration(handle_t const& handle,
 
     // Apply matrix
     if (shift != 0)
-      CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n,
-                               lanczosVecs_dev + (*iter - 1) * n,
-                               n * sizeof(value_type_t),
-                               cudaMemcpyDeviceToDevice,
-                               stream));
+      RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n,
+                                    lanczosVecs_dev + (*iter - 1) * n,
+                                    n * sizeof(value_type_t),
+                                    cudaMemcpyDeviceToDevice,
+                                    stream));
     A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n));
 
     // Full reorthogonalization
     //   "Twice is enough" algorithm per Kahan and Parlett
     if (reorthogonalize) {
-      CUBLAS_CHECK(cublasgemv(cublas_h,
-                              CUBLAS_OP_T,
-                              n,
-                              *iter,
-                              &one,
-                              lanczosVecs_dev,
-                              n,
-                              lanczosVecs_dev + IDX(0, *iter, n),
-                              1,
-                              &zero,
-                              work_dev,
-                              1,
-                              stream));
-
-      CUBLAS_CHECK(cublasgemv(cublas_h,
-                              CUBLAS_OP_N,
-                              n,
-                              *iter,
-                              &negOne,
-                              lanczosVecs_dev,
-                              n,
-                              work_dev,
-                              1,
-                              &one,
-                              lanczosVecs_dev + IDX(0, *iter, n),
-                              1,
-                              stream));
-
-      CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1),
-                               work_dev + (*iter - 1),
-                               sizeof(value_type_t),
-                               cudaMemcpyDeviceToHost,
-                               stream));
-
-      CUBLAS_CHECK(cublasgemv(cublas_h,
-                              CUBLAS_OP_T,
-                              n,
-                              *iter,
-                              &one,
-                              lanczosVecs_dev,
-                              n,
-                              lanczosVecs_dev + IDX(0, *iter, n),
-                              1,
-                              &zero,
-                              work_dev,
-                              1,
-                              stream));
-
-      CUBLAS_CHECK(cublasgemv(cublas_h,
-                              CUBLAS_OP_N,
-                              n,
-                              *iter,
-                              &negOne,
-                              lanczosVecs_dev,
-                              n,
-                              work_dev,
-                              1,
-                              &one,
-                              lanczosVecs_dev + IDX(0, *iter, n),
-                              1,
-                              stream));
+      RAFT_CUBLAS_TRY(cublasgemv(cublas_h,
+                                 CUBLAS_OP_T,
+                                 n,
+                                 *iter,
+                                 &one,
+                                 lanczosVecs_dev,
+                                 n,
+                                 lanczosVecs_dev + IDX(0, *iter, n),
+                                 1,
+                                 &zero,
+                                 work_dev,
+                                 1,
+                                 stream));
+
+      RAFT_CUBLAS_TRY(cublasgemv(cublas_h,
+                                 CUBLAS_OP_N,
+                                 n,
+                                 *iter,
+                                 &negOne,
+                                 lanczosVecs_dev,
+                                 n,
+                                 work_dev,
+                                 1,
+                                 &one,
+                                 lanczosVecs_dev + IDX(0, *iter, n),
+                                 1,
+                                 stream));
+
+      RAFT_CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1),
+                                    work_dev + (*iter - 1),
+                                    sizeof(value_type_t),
+                                    cudaMemcpyDeviceToHost,
+                                    stream));
+
+      RAFT_CUBLAS_TRY(cublasgemv(cublas_h,
+                                 CUBLAS_OP_T,
+                                 n,
+                                 *iter,
+                                 &one,
+                                 lanczosVecs_dev,
+                                 n,
+                                 lanczosVecs_dev + IDX(0, *iter, n),
+                                 1,
+                                 &zero,
+                                 work_dev,
+                                 1,
+                                 stream));
+
+      RAFT_CUBLAS_TRY(cublasgemv(cublas_h,
+                                 CUBLAS_OP_N,
+                                 n,
+                                 *iter,
+                                 &negOne,
+                                 lanczosVecs_dev,
+                                 n,
+                                 work_dev,
+                                 1,
+                                 &one,
+                                 lanczosVecs_dev + IDX(0, *iter, n),
+                                 1,
+                                 stream));
     }
 
     // Orthogonalization with 3-term recurrence relation
     else {
-      CUBLAS_CHECK(cublasdot(cublas_h,
-                             n,
-                             lanczosVecs_dev + IDX(0, *iter - 1, n),
-                             1,
-                             lanczosVecs_dev + IDX(0, *iter, n),
-                             1,
-                             alpha_host + (*iter - 1),
-                             stream));
+      RAFT_CUBLAS_TRY(cublasdot(cublas_h,
+                                n,
+                                lanczosVecs_dev + IDX(0, *iter - 1, n),
+                                1,
+                                lanczosVecs_dev + IDX(0, *iter, n),
+                                1,
+                                alpha_host + (*iter - 1),
+                                stream));
 
       auto alpha = -alpha_host[*iter - 1];
-      CUBLAS_CHECK(cublasaxpy(cublas_h,
-                              n,
-                              &alpha,
-                              lanczosVecs_dev + IDX(0, *iter - 1, n),
-                              1,
-                              lanczosVecs_dev + IDX(0, *iter, n),
-                              1,
-                              stream));
+      RAFT_CUBLAS_TRY(cublasaxpy(cublas_h,
+                                 n,
+                                 &alpha,
+                                 lanczosVecs_dev + IDX(0, *iter - 1, n),
+                                 1,
+                                 lanczosVecs_dev + IDX(0, *iter, n),
+                                 1,
+                                 stream));
 
       alpha = -beta_host[*iter - 2];
-      CUBLAS_CHECK(cublasaxpy(cublas_h,
-                              n,
-                              &alpha,
-                              lanczosVecs_dev + IDX(0, *iter - 2, n),
-                              1,
-                              lanczosVecs_dev + IDX(0, *iter, n),
-                              1,
-                              stream));
+      RAFT_CUBLAS_TRY(cublasaxpy(cublas_h,
+                                 n,
+                                 &alpha,
+                                 lanczosVecs_dev + IDX(0, *iter - 2, n),
+                                 1,
+                                 lanczosVecs_dev + IDX(0, *iter, n),
+                                 1,
+                                 stream));
     }
 
     // Compute residual
-    CUBLAS_CHECK(cublasnrm2(
+    RAFT_CUBLAS_TRY(cublasnrm2(
       cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream));
 
     // Check if Lanczos has converged
@@ -270,10 +270,10 @@ int performLanczosIteration(handle_t const& handle,
 
     // Normalize Lanczos vector
     alpha = 1 / beta_host[*iter - 1];
-    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+    RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
   }
 
-  CUDA_TRY(cudaStreamSynchronize(stream));
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
 
   return 0;
 }
@@ -638,59 +638,60 @@ static int lanczosRestart(handle_t const& handle,
       WARNING("error in implicitly shifted QR algorithm");
 
   // Obtain new residual
-  CUDA_TRY(cudaMemcpyAsync(
+  RAFT_CUDA_TRY(cudaMemcpyAsync(
     V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream));
 
   beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
-  CUBLAS_CHECK(cublasgemv(cublas_h,
-                          CUBLAS_OP_N,
-                          n,
-                          iter,
-                          beta_host + iter_new - 1,
-                          lanczosVecs_dev,
-                          n,
-                          V_dev + IDX(0, iter_new, iter),
-                          1,
-                          beta_host + iter - 1,
-                          lanczosVecs_dev + IDX(0, iter, n),
-                          1,
-                          stream));
+  RAFT_CUBLAS_TRY(cublasgemv(cublas_h,
+                             CUBLAS_OP_N,
+                             n,
+                             iter,
+                             beta_host + iter_new - 1,
+                             lanczosVecs_dev,
+                             n,
+                             V_dev + IDX(0, iter_new, iter),
+                             1,
+                             beta_host + iter - 1,
+                             lanczosVecs_dev + IDX(0, iter, n),
+                             1,
+                             stream));
 
   // Obtain new Lanczos vectors
-  CUBLAS_CHECK(cublasgemm(cublas_h,
-                          CUBLAS_OP_N,
-                          CUBLAS_OP_N,
-                          n,
-                          iter_new,
-                          iter,
-                          &one,
-                          lanczosVecs_dev,
-                          n,
-                          V_dev,
-                          iter,
-                          &zero,
-                          work_dev,
-                          n,
-                          stream));
-
-  CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev,
-                           work_dev,
-                           n * iter_new * sizeof(value_type_t),
-                           cudaMemcpyDeviceToDevice,
-                           stream));
+  RAFT_CUBLAS_TRY(cublasgemm(cublas_h,
+                             CUBLAS_OP_N,
+                             CUBLAS_OP_N,
+                             n,
+                             iter_new,
+                             iter,
+                             &one,
+                             lanczosVecs_dev,
+                             n,
+                             V_dev,
+                             iter,
+                             &zero,
+                             work_dev,
+                             n,
+                             stream));
+
+  RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev,
+                                work_dev,
+                                n * iter_new * sizeof(value_type_t),
+                                cudaMemcpyDeviceToDevice,
+                                stream));
 
   // Normalize residual to obtain new Lanczos vector
-  CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n),
-                           lanczosVecs_dev + IDX(0, iter, n),
-                           n * sizeof(value_type_t),
-                           cudaMemcpyDeviceToDevice,
-                           stream));
+  RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n),
+                                lanczosVecs_dev + IDX(0, iter, n),
+                                n * sizeof(value_type_t),
+                                cudaMemcpyDeviceToDevice,
+                                stream));
 
-  CUBLAS_CHECK(cublasnrm2(
+  RAFT_CUBLAS_TRY(cublasnrm2(
     cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream));
 
   auto h_beta = 1 / beta_host[iter_new - 1];
-  CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream));
+  RAFT_CUBLAS_TRY(
+    cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream));
 
   return 0;
 }
@@ -821,7 +822,7 @@ int computeSmallestEigenvectors(handle_t const& handle,
   work_host = work_host_v.data();
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // -------------------------------------------------------
   // Compute largest eigenvalue to determine shift
@@ -837,10 +838,10 @@ int computeSmallestEigenvectors(handle_t const& handle,
   // Initialize initial Lanczos vector
   curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one);
   value_type_t normQ1;
-  CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream));
+  RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream));
 
   auto h_val = 1 / normQ1;
-  CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream));
+  RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream));
 
   // Obtain tridiagonal matrix with Lanczos
   *effIter = 0;
@@ -956,35 +957,35 @@ int computeSmallestEigenvectors(handle_t const& handle,
     work_host[i + 2 * (*effIter)] = 0;
 
   // Copy results to device memory
-  CUDA_TRY(cudaMemcpyAsync(eigVals_dev,
-                           work_host + 2 * (*effIter),
-                           nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice,
-                           stream));
-
-  CUDA_TRY(cudaMemcpyAsync(work_dev,
-                           Z_host,
-                           (*effIter) * nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice,
-                           stream));
+  RAFT_CUDA_TRY(cudaMemcpyAsync(eigVals_dev,
+                                work_host + 2 * (*effIter),
+                                nEigVecs * sizeof(value_type_t),
+                                cudaMemcpyHostToDevice,
+                                stream));
+
+  RAFT_CUDA_TRY(cudaMemcpyAsync(work_dev,
+                                Z_host,
+                                (*effIter) * nEigVecs * sizeof(value_type_t),
+                                cudaMemcpyHostToDevice,
+                                stream));
   CHECK_CUDA(stream);
 
   // Convert eigenvectors from Lanczos basis to standard basis
-  CUBLAS_CHECK(cublasgemm(cublas_h,
-                          CUBLAS_OP_N,
-                          CUBLAS_OP_N,
-                          n,
-                          nEigVecs,
-                          *effIter,
-                          &one,
-                          lanczosVecs_dev,
-                          n,
-                          work_dev,
-                          *effIter,
-                          &zero,
-                          eigVecs_dev,
-                          n,
-                          stream));
+  RAFT_CUBLAS_TRY(cublasgemm(cublas_h,
+                             CUBLAS_OP_N,
+                             CUBLAS_OP_N,
+                             n,
+                             nEigVecs,
+                             *effIter,
+                             &one,
+                             lanczosVecs_dev,
+                             n,
+                             work_dev,
+                             *effIter,
+                             &zero,
+                             eigVecs_dev,
+                             n,
+                             stream));
 
   // Clean up and exit
   curandDestroyGenerator(randGen);
@@ -1167,7 +1168,7 @@ int computeLargestEigenvectors(handle_t const& handle,
   work_host = work_host_v.data();
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // -------------------------------------------------------
   // Compute largest eigenvalue
@@ -1181,10 +1182,10 @@ int computeLargestEigenvectors(handle_t const& handle,
   // Initialize initial Lanczos vector
   curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one);
   value_type_t normQ1;
-  CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream));
+  RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream));
 
   auto h_val = 1 / normQ1;
-  CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream));
+  RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream));
 
   // Obtain tridiagonal matrix with Lanczos
   *effIter               = 0;
@@ -1303,37 +1304,37 @@ int computeLargestEigenvectors(handle_t const& handle,
 
   // Copy results to device memory
   // skip smallest eigenvalue if needed
-  CUDA_TRY(cudaMemcpyAsync(eigVals_dev,
-                           work_host + 2 * (*effIter) + top_eigenparis_idx_offset,
-                           nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice,
-                           stream));
+  RAFT_CUDA_TRY(cudaMemcpyAsync(eigVals_dev,
+                                work_host + 2 * (*effIter) + top_eigenparis_idx_offset,
+                                nEigVecs * sizeof(value_type_t),
+                                cudaMemcpyHostToDevice,
+                                stream));
 
   // skip smallest eigenvector if needed
-  CUDA_TRY(cudaMemcpyAsync(work_dev,
-                           Z_host + (top_eigenparis_idx_offset * (*effIter)),
-                           (*effIter) * nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice,
-                           stream));
+  RAFT_CUDA_TRY(cudaMemcpyAsync(work_dev,
+                                Z_host + (top_eigenparis_idx_offset * (*effIter)),
+                                (*effIter) * nEigVecs * sizeof(value_type_t),
+                                cudaMemcpyHostToDevice,
+                                stream));
 
   CHECK_CUDA(stream);
 
   // Convert eigenvectors from Lanczos basis to standard basis
-  CUBLAS_CHECK(cublasgemm(cublas_h,
-                          CUBLAS_OP_N,
-                          CUBLAS_OP_N,
-                          n,
-                          nEigVecs,
-                          *effIter,
-                          &one,
-                          lanczosVecs_dev,
-                          n,
-                          work_dev,
-                          *effIter,
-                          &zero,
-                          eigVecs_dev,
-                          n,
-                          stream));
+  RAFT_CUBLAS_TRY(cublasgemm(cublas_h,
+                             CUBLAS_OP_N,
+                             CUBLAS_OP_N,
+                             n,
+                             nEigVecs,
+                             *effIter,
+                             &one,
+                             lanczosVecs_dev,
+                             n,
+                             work_dev,
+                             *effIter,
+                             &zero,
+                             eigVecs_dev,
+                             n,
+                             stream));
 
   // Clean up and exit
   curandDestroyGenerator(randGen);

From 5d8c176de6706aecc96d925b653aae68f22edefa Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 26 Jan 2022 16:39:36 -0800
Subject: [PATCH 13/17] more macro renames

---
 cpp/include/raft/linalg/detail/map.cuh             | 2 +-
 cpp/include/raft/linalg/detail/map_then_reduce.cuh | 2 +-
 cpp/include/raft/linalg/detail/subtract.cuh        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh
index 513432ef27..56f1dd6f19 100644
--- a/cpp/include/raft/linalg/detail/map.cuh
+++ b/cpp/include/raft/linalg/detail/map.cuh
@@ -40,7 +40,7 @@ void mapImpl(
   const int nblks = raft::ceildiv(len, (size_t)TPB);
   mapKernel<InType, OutType, MapOp, TPB, Args...>
     <<<nblks, TPB, 0, stream>>>(out, len, map, in, args...);
-  CUDA_CHECK(cudaPeekAtLastError());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
 }  // namespace detail
diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh
index 99e04d82e7..281861b2f9 100644
--- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh
+++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh
@@ -89,7 +89,7 @@ void mapThenReduceImpl(OutType* out,
   const int nblks = raft::ceildiv(len, (size_t)TPB);
   mapThenReduceKernel<InType, OutType, MapOp, ReduceLambda, TPB, Args...>
     <<<nblks, TPB, 0, stream>>>(out, len, neutral, map, op, in, args...);
-  CUDA_CHECK(cudaPeekAtLastError());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
 };  // end namespace detail
diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh
index 767373574b..23d5eded05 100644
--- a/cpp/include/raft/linalg/detail/subtract.cuh
+++ b/cpp/include/raft/linalg/detail/subtract.cuh
@@ -61,7 +61,7 @@ void subtractDevScalar(math_t* outDev,
   const IdxType nblks = raft::ceildiv(len, (IdxType)TPB);
   subtract_dev_scalar_kernel<math_t>
     <<<nblks, TPB, 0, stream>>>(outDev, inDev, singleScalarDev, len);
-  CUDA_CHECK(cudaPeekAtLastError());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
 };  // end namespace detail

From 14cddfc2c1fb4db232969247cd960c44f928abe3 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 2 Feb 2022 13:12:11 -0800
Subject: [PATCH 14/17] adding explicit stream set back to cublas and cusolver
 wrappers

---
 .../raft/linalg/detail/cublas_wrappers.hpp    | 36 +++++++++++++
 .../raft/linalg/detail/cusolver_wrappers.hpp  | 52 +++++++++++++++++++
 2 files changed, 88 insertions(+)

diff --git a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
index 83890f348a..9d8d477355 100644
--- a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
+++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
@@ -143,6 +143,7 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle,
                                  int incy,
                                  cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSaxpy(handle, n, alpha, x, incx, y, incy);
 }
 
@@ -156,6 +157,7 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle,
                                  int incy,
                                  cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDaxpy(handle, n, alpha, x, incx, y, incy);
 }
 /** @} */
@@ -172,6 +174,7 @@ template <>
 inline cublasStatus_t cublasSwap(
   cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSswap(handle, n, x, incx, y, incy);
 }
 
@@ -179,6 +182,7 @@ template <>
 inline cublasStatus_t cublasSwap(
   cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDswap(handle, n, x, incx, y, incy);
 }
 
@@ -196,12 +200,14 @@ template <>
 inline cublasStatus_t cublasCopy(
   cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy, cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasScopy(handle, n, x, incx, y, incy);
 }
 template <>
 inline cublasStatus_t cublasCopy(
   cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy, cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDcopy(handle, n, x, incx, y, incy);
 }
 /** @} */
@@ -240,6 +246,7 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle,
                                  int incy,
                                  cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy);
 }
 
@@ -258,6 +265,7 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle,
                                  int incy,
                                  cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy);
 }
 /** @} */
@@ -291,6 +299,7 @@ inline cublasStatus_t cublasger(cublasHandle_t handle,
                                 int lda,
                                 cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda);
 }
 
@@ -307,6 +316,7 @@ inline cublasStatus_t cublasger(cublasHandle_t handle,
                                 int lda,
                                 cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda);
 }
 /** @} */
@@ -349,6 +359,7 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc);
 }
 
@@ -369,6 +380,7 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc);
 }
 /** @} */
@@ -414,6 +426,7 @@ inline cublasStatus_t cublasgemmBatched(  // NOLINT
   int batchCount,
   cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSgemmBatched(handle,
                             transa,
                             transb,
@@ -450,6 +463,7 @@ inline cublasStatus_t cublasgemmBatched(  // NOLINT
   int batchCount,
   cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDgemmBatched(handle,
                             transa,
                             transb,
@@ -516,6 +530,7 @@ inline cublasStatus_t cublasgemmStridedBatched(  // NOLINT
   int batchCount,
   cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSgemmStridedBatched(handle,
                                    transa,
                                    transb,
@@ -558,6 +573,7 @@ inline cublasStatus_t cublasgemmStridedBatched(  // NOLINT
   int batchCount,
   cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDgemmStridedBatched(handle,
                                    transa,
                                    transb,
@@ -604,6 +620,7 @@ inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,  // NOLINT
                                          int batchSize,
                                          cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize);
 }
 
@@ -617,6 +634,7 @@ inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,  // NOLINT
                                          int batchSize,
                                          cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize);
 }
 
@@ -645,6 +663,7 @@ inline cublasStatus_t cublasgetriBatched(  // NOLINT
   int batchSize,
   cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize);
 }
 
@@ -661,6 +680,7 @@ inline cublasStatus_t cublasgetriBatched(  // NOLINT
   int batchSize,
   cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize);
 }
 
@@ -701,6 +721,7 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
                                         int batchSize,
                                         cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSgelsBatched(
     handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
 }
@@ -720,6 +741,7 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
                                         int batchSize,
                                         cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDgelsBatched(
     handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
 }
@@ -762,6 +784,7 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc);
 }
 
@@ -781,6 +804,7 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc);
 }
 /** @} */
@@ -821,6 +845,7 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
 }
 
@@ -840,6 +865,7 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
 }
 /** @} */
@@ -876,6 +902,7 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
 }
 
@@ -893,6 +920,7 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle,
                                  int ldc,
                                  cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
 }
 /** @} */
@@ -909,6 +937,7 @@ template <>
 inline cublasStatus_t cublasnrm2(
   cublasHandle_t handle, int n, const float* x, int incx, float* result, cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSnrm2(handle, n, x, incx, result);
 }
 
@@ -916,6 +945,7 @@ template <>
 inline cublasStatus_t cublasnrm2(
   cublasHandle_t handle, int n, const double* x, int incx, double* result, cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDnrm2(handle, n, x, incx, result);
 }
 /** @} */
@@ -950,6 +980,7 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle,
                                  int ldb,
                                  cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
 }
 
@@ -968,6 +999,7 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle,
                                  int ldb,
                                  cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
 }
 
@@ -995,6 +1027,7 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle,
                                 float* result,
                                 cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSdot(handle, n, x, incx, y, incy, result);
 }
 
@@ -1008,6 +1041,7 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle,
                                 double* result,
                                 cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDdot(handle, n, x, incx, y, incy, result);
 }
 /** @} */
@@ -1044,6 +1078,7 @@ template <>
 inline cublasStatus_t cublasscal(
   cublasHandle_t handle, int n, const float* alpha, float* x, int incx, cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSscal(handle, n, alpha, x, incx);
 }
 
@@ -1051,6 +1086,7 @@ template <>
 inline cublasStatus_t cublasscal(
   cublasHandle_t handle, int n, const double* alpha, double* x, int incx, cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasDscal(handle, n, alpha, x, incx);
 }
 
diff --git a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
index aac58547f8..acfd239174 100644
--- a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
+++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
@@ -143,6 +143,7 @@ inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
 }
 
@@ -157,6 +158,7 @@ inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
 }
 
@@ -178,6 +180,7 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
   int lda,
   int* Lwork)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 
@@ -190,6 +193,7 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
   int lda,
   int* Lwork)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDgetrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 
@@ -223,6 +227,7 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
 }
 
@@ -239,6 +244,7 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
 }
 /** @} */
@@ -269,6 +275,7 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
   const float* W,
   int* lwork)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork);
 }
 
@@ -283,6 +290,7 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
   const double* W,
   int* lwork)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork);
 }
 /** @} */
@@ -320,6 +328,7 @@ inline cusolverStatus_t cusolverDnsyevj(  // NOLINT
   syevjInfo_t params,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
 }
 
@@ -338,6 +347,7 @@ inline cusolverStatus_t cusolverDnsyevj(  // NOLINT
   syevjInfo_t params,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
 }
 
@@ -365,6 +375,7 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
   int* lwork,
   syevjInfo_t params)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params);
 }
 
@@ -380,6 +391,7 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
   int* lwork,
   syevjInfo_t params)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params);
 }
 /** @} */
@@ -414,6 +426,7 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo);
 }
 
@@ -430,6 +443,7 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo);
 }
 /** @} */
@@ -472,6 +486,7 @@ inline cusolverStatus_t cusolverDnsyevdx_bufferSize(  // NOLINT
   const float* W,
   int* lwork)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSsyevdx_bufferSize(
     handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork);
 }
@@ -493,6 +508,7 @@ inline cusolverStatus_t cusolverDnsyevdx_bufferSize(  // NOLINT
   const double* W,
   int* lwork)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDsyevdx_bufferSize(
     handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork);
 }
@@ -537,6 +553,7 @@ inline cusolverStatus_t cusolverDnsyevdx(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSsyevdx(
     handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo);
 }
@@ -561,6 +578,7 @@ inline cusolverStatus_t cusolverDnsyevdx(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDsyevdx(
     handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo);
 }
@@ -578,8 +596,10 @@ cusolverStatus_t cusolverDngesvd_bufferSize(  // NOLINT
   int* lwork)
 {
   if (std::is_same<std::decay_t<T>, float>::value) {
+    RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
     return cusolverDnSgesvd_bufferSize(handle, m, n, lwork);
   } else {
+    RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
     return cusolverDnDgesvd_bufferSize(handle, m, n, lwork);
   }
 }
@@ -622,6 +642,7 @@ inline cusolverStatus_t cusolverDngesvd(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSgesvd(
     handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo);
 }
@@ -645,6 +666,7 @@ inline cusolverStatus_t cusolverDngesvd(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDgesvd(
     handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo);
 }
@@ -682,6 +704,7 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize(  // NOLINT
   int* lwork,
   gesvdjInfo_t params)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSgesvdj_bufferSize(
     handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params);
 }
@@ -702,6 +725,7 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize(  // NOLINT
   int* lwork,
   gesvdjInfo_t params)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDgesvdj_bufferSize(
     handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params);
 }
@@ -744,6 +768,7 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
   gesvdjInfo_t params,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSgesvdj(
     handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params);
 }
@@ -767,6 +792,7 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
   gesvdjInfo_t params,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDgesvdj(
     handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params);
 }
@@ -794,6 +820,7 @@ inline cusolverStatus_t cusolverDnpotrf_bufferSize(  // NOLINT
   int lda,
   int* Lwork)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, Lwork);
 }
 
@@ -806,6 +833,7 @@ inline cusolverStatus_t cusolverDnpotrf_bufferSize(  // NOLINT
   int lda,
   int* Lwork)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, Lwork);
 }
 
@@ -831,6 +859,7 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
 }
 
@@ -845,6 +874,7 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
 }
 /** @} */
@@ -877,6 +907,7 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
 }
 
@@ -892,6 +923,7 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
 }
 /** @} */
@@ -923,6 +955,7 @@ inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
 }
 template <>
@@ -937,6 +970,7 @@ inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle,  // NOLINT
                                         int* devInfo,
                                         cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
 }
 
@@ -957,6 +991,7 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
   int lda,
   int* Lwork)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 template <>
@@ -968,6 +1003,7 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
   int lda,
   int* Lwork)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 /** @} */
@@ -1003,6 +1039,7 @@ inline cusolverStatus_t cusolverDnorgqr(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo);
 }
 template <>
@@ -1019,6 +1056,7 @@ inline cusolverStatus_t cusolverDnorgqr(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo);
 }
 
@@ -1043,6 +1081,7 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
   const float* TAU,
   int* lwork)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork);
 }
 template <>
@@ -1056,6 +1095,7 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
   const double* TAU,
   int* lwork)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork);
 }
 /** @} */
@@ -1099,6 +1139,7 @@ inline cusolverStatus_t cusolverDnormqr(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
 }
 
@@ -1120,6 +1161,7 @@ inline cusolverStatus_t cusolverDnormqr(  // NOLINT
   int* devInfo,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
 }
 
@@ -1153,6 +1195,7 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
   int ldc,
   int* lwork)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
 }
 
@@ -1171,6 +1214,7 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
   int ldc,
   int* lwork)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
 }
 /** @} */
@@ -1209,6 +1253,7 @@ inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched(  // NOLINT
   size_t* internalDataInBytes,
   size_t* workspaceInBytes)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverSpScsrqrBufferInfoBatched(handle,
                                            m,
                                            n,
@@ -1238,6 +1283,7 @@ inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched(  // NOLINT
   size_t* internalDataInBytes,
   size_t* workspaceInBytes)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverSpDcsrqrBufferInfoBatched(handle,
                                            m,
                                            n,
@@ -1286,6 +1332,7 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
   void* pBuffer,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverSpScsrqrsvBatched(
     handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer);
 }
@@ -1307,6 +1354,7 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
   void* pBuffer,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverSpDcsrqrsvBatched(
     handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer);
 }
@@ -1345,6 +1393,7 @@ inline cusolverStatus_t cusolverDnxsyevd_bufferSize(  // NOLINT
   size_t* workspaceInBytesOnHost,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnXsyevd_bufferSize(handle,
                                      params,
                                      jobz,
@@ -1374,6 +1423,7 @@ inline cusolverStatus_t cusolverDnxsyevd_bufferSize(  // NOLINT
   size_t* workspaceInBytesOnHost,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnXsyevd_bufferSize(handle,
                                      params,
                                      jobz,
@@ -1423,6 +1473,7 @@ inline cusolverStatus_t cusolverDnxsyevd(  // NOLINT
   int* info,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnXsyevd(handle,
                           params,
                           jobz,
@@ -1458,6 +1509,7 @@ inline cusolverStatus_t cusolverDnxsyevd(  // NOLINT
   int* info,
   cudaStream_t stream)
 {
+  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnXsyevd(handle,
                           params,
                           jobz,

From a2f670f05d204fbe95ad520f20e770d494373acc Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 2 Feb 2022 14:53:38 -0800
Subject: [PATCH 15/17] resolving errors

---
 .../raft/linalg/detail/cusolver_wrappers.hpp  | 26 ++-----------------
 1 file changed, 2 insertions(+), 24 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
index acfd239174..34ec6cb673 100644
--- a/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
+++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp
@@ -180,7 +180,6 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
   int lda,
   int* Lwork)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 
@@ -193,7 +192,6 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
   int lda,
   int* Lwork)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDgetrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 
@@ -275,7 +273,6 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
   const float* W,
   int* lwork)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork);
 }
 
@@ -290,7 +287,6 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
   const double* W,
   int* lwork)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork);
 }
 /** @} */
@@ -375,7 +371,6 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
   int* lwork,
   syevjInfo_t params)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params);
 }
 
@@ -391,7 +386,6 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
   int* lwork,
   syevjInfo_t params)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params);
 }
 /** @} */
@@ -486,7 +480,6 @@ inline cusolverStatus_t cusolverDnsyevdx_bufferSize(  // NOLINT
   const float* W,
   int* lwork)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSsyevdx_bufferSize(
     handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork);
 }
@@ -508,7 +501,6 @@ inline cusolverStatus_t cusolverDnsyevdx_bufferSize(  // NOLINT
   const double* W,
   int* lwork)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDsyevdx_bufferSize(
     handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork);
 }
@@ -596,10 +588,8 @@ cusolverStatus_t cusolverDngesvd_bufferSize(  // NOLINT
   int* lwork)
 {
   if (std::is_same<std::decay_t<T>, float>::value) {
-    RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
     return cusolverDnSgesvd_bufferSize(handle, m, n, lwork);
   } else {
-    RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
     return cusolverDnDgesvd_bufferSize(handle, m, n, lwork);
   }
 }
@@ -704,7 +694,6 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize(  // NOLINT
   int* lwork,
   gesvdjInfo_t params)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSgesvdj_bufferSize(
     handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params);
 }
@@ -725,7 +714,6 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize(  // NOLINT
   int* lwork,
   gesvdjInfo_t params)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDgesvdj_bufferSize(
     handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params);
 }
@@ -820,7 +808,6 @@ inline cusolverStatus_t cusolverDnpotrf_bufferSize(  // NOLINT
   int lda,
   int* Lwork)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, Lwork);
 }
 
@@ -833,7 +820,6 @@ inline cusolverStatus_t cusolverDnpotrf_bufferSize(  // NOLINT
   int lda,
   int* Lwork)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, Lwork);
 }
 
@@ -991,7 +977,6 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
   int lda,
   int* Lwork)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 template <>
@@ -1003,7 +988,6 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
   int lda,
   int* Lwork)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 /** @} */
@@ -1081,7 +1065,6 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
   const float* TAU,
   int* lwork)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork);
 }
 template <>
@@ -1095,7 +1078,6 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
   const double* TAU,
   int* lwork)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork);
 }
 /** @} */
@@ -1195,7 +1177,6 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
   int ldc,
   int* lwork)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
 }
 
@@ -1214,7 +1195,6 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
   int ldc,
   int* lwork)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
 }
 /** @} */
@@ -1253,7 +1233,6 @@ inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched(  // NOLINT
   size_t* internalDataInBytes,
   size_t* workspaceInBytes)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverSpScsrqrBufferInfoBatched(handle,
                                            m,
                                            n,
@@ -1283,7 +1262,6 @@ inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched(  // NOLINT
   size_t* internalDataInBytes,
   size_t* workspaceInBytes)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
   return cusolverSpDcsrqrBufferInfoBatched(handle,
                                            m,
                                            n,
@@ -1332,7 +1310,7 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
   void* pBuffer,
   cudaStream_t stream)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
+  RAFT_CUSOLVER_TRY(cusolverSpSetStream(handle, stream));
   return cusolverSpScsrqrsvBatched(
     handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer);
 }
@@ -1354,7 +1332,7 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
   void* pBuffer,
   cudaStream_t stream)
 {
-  RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream));
+  RAFT_CUSOLVER_TRY(cusolverSpSetStream(handle, stream));
   return cusolverSpDcsrqrsvBatched(
     handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer);
 }

From 89bf3c1973fc9dd9d38b9de205b365367b7a820e Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 3 Feb 2022 19:38:16 -0800
Subject: [PATCH 16/17] adding set stream to cublas set pointer mode

---
 cpp/include/raft/linalg/detail/cublas_wrappers.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
index 9d8d477355..552bae6b7e 100644
--- a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
+++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
@@ -1062,6 +1062,7 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle,
                                            cublasPointerMode_t mode,
                                            cudaStream_t stream)
 {
+  RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
   return cublasSetPointerMode(handle, mode);
 }
 /** @} */

From f94beefb95e962e087a9704f7e49f880d8e7085d Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 7 Feb 2022 18:25:31 -0500
Subject: [PATCH 17/17] Fixing a bad merge

---
 cpp/include/raft/linalg/gemv.hpp          | 31 +++++++++++++----------
 cpp/include/raft/stats/detail/meanvar.cuh |  2 +-
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/cpp/include/raft/linalg/gemv.hpp b/cpp/include/raft/linalg/gemv.hpp
index 3ff919c1f4..45766b8c9a 100644
--- a/cpp/include/raft/linalg/gemv.hpp
+++ b/cpp/include/raft/linalg/gemv.hpp
@@ -17,6 +17,9 @@
 #pragma once
 
 #include "detail/gemv.hpp"
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+
+#include <raft/handle.hpp>
 
 namespace raft {
 namespace linalg {
@@ -57,20 +60,20 @@ void gemv(const raft::handle_t& handle,
           cudaStream_t stream)
 {
   cublasHandle_t cublas_h = handle.get_cublas_handle();
-  cublas_device_pointer_mode<DevicePointerMode> pmode(cublas_h);
-  RAFT_CUBLAS_TRY(cublasgemv(cublas_h,
-                             trans_a ? CUBLAS_OP_T : CUBLAS_OP_N,
-                             m,
-                             n,
-                             alpha,
-                             A,
-                             lda,
-                             x,
-                             incx,
-                             beta,
-                             y,
-                             incy,
-                             stream));
+  detail::cublas_device_pointer_mode<DevicePointerMode> pmode(cublas_h);
+  RAFT_CUBLAS_TRY(detail::cublasgemv(cublas_h,
+                                     trans_a ? CUBLAS_OP_T : CUBLAS_OP_N,
+                                     m,
+                                     n,
+                                     alpha,
+                                     A,
+                                     lda,
+                                     x,
+                                     incx,
+                                     beta,
+                                     y,
+                                     incy,
+                                     stream));
 }
 
 template <typename math_t>
diff --git a/cpp/include/raft/stats/detail/meanvar.cuh b/cpp/include/raft/stats/detail/meanvar.cuh
index ed411ef74d..e3f586fea8 100644
--- a/cpp/include/raft/stats/detail/meanvar.cuh
+++ b/cpp/include/raft/stats/detail/meanvar.cuh
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/reduce.cuh>
+#include <raft/linalg/reduce.hpp>
 
 namespace raft::stats::detail {