diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh index 8384598805..21d04f3f8d 100644 --- a/cpp/include/raft/distance/detail/correlation.cuh +++ b/cpp/include/raft/distance/detail/correlation.cuh @@ -17,7 +17,7 @@ #pragma once #include #include -#include <raft/linalg/distance_type.h> +#include <raft/distance/distance_type.hpp> namespace raft { namespace distance { diff --git a/cpp/include/raft/distance/detail/cosine.cuh b/cpp/include/raft/distance/detail/cosine.cuh index 3007198f60..bead5f1f71 100644 --- a/cpp/include/raft/distance/detail/cosine.cuh +++ b/cpp/include/raft/distance/detail/cosine.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include <raft/linalg/distance_type.h> +#include <raft/distance/distance_type.hpp> namespace raft { namespace distance { diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 21031afef1..45850de115 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -30,7 +30,7 @@ #include #include #include -#include <raft/linalg/distance_type.h> +#include <raft/distance/distance_type.hpp> #include namespace raft { diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh index a8deb8df24..4786f584c4 100644 --- a/cpp/include/raft/distance/detail/euclidean.cuh +++ b/cpp/include/raft/distance/detail/euclidean.cuh @@ -16,7 +16,7 @@ #pragma once #include -#include <raft/linalg/distance_type.h> +#include <raft/distance/distance_type.hpp> namespace raft { namespace distance { diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index 6ad939ecd5..80eb6021ef 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -20,7 +20,7 @@ #include #include #include -#include <raft/linalg/distance_type.h> +#include <raft/distance/distance_type.hpp> #include namespace raft { diff --git a/cpp/include/raft/distance/detail/hellinger.cuh b/cpp/include/raft/distance/detail/hellinger.cuh index 1874d2e942..3cb0469803 100644 --- a/cpp/include/raft/distance/detail/hellinger.cuh +++ b/cpp/include/raft/distance/detail/hellinger.cuh @@ -16,7 +16,7 @@ #pragma once #include -#include <raft/linalg/distance_type.h> +#include <raft/distance/distance_type.hpp> namespace raft { namespace distance { diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index 08911e0350..bfca731443 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -16,8 +16,8 @@ #pragma once #include #include -#include -#include +#include +#include #include #include diff --git a/cpp/include/raft/distance/distance.hpp b/cpp/include/raft/distance/distance.hpp index 3dad7ea6d7..935cf6677a 100644 --- a/cpp/include/raft/distance/distance.hpp +++ b/cpp/include/raft/distance/distance.hpp @@ -17,8 +17,8 @@ #pragma once #include +#include <raft/distance/distance_type.hpp> #include -#include <raft/linalg/distance_type.h> #include namespace raft { diff --git a/cpp/include/raft/linalg/distance_type.h b/cpp/include/raft/distance/distance_type.hpp similarity index 97% rename from cpp/include/raft/linalg/distance_type.h rename to cpp/include/raft/distance/distance_type.hpp index 681a83f3f8..f75263b00d 100644 --- a/cpp/include/raft/linalg/distance_type.h +++ b/cpp/include/raft/distance/distance_type.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
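For downstream code, the `distance_type` relocation above is a pure include-path change: the 97% similarity index indicates that only the copyright header differs, so the `DistanceType` enum itself is untouched. A minimal consumer-side sketch, assuming the enum keeps its `raft::distance` namespace and enumerators such as `L2Expanded`:

```cpp
// Before this patch, callers included the old header:
//   #include <raft/linalg/distance_type.h>
// After this patch, they include the relocated one:
#include <raft/distance/distance_type.hpp>

// The enum already lived in raft::distance, so call sites compile unchanged:
auto metric = raft::distance::DistanceType::L2Expanded;
```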
diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 6421ba5344..22e9e78ebe 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -33,9 +33,10 @@ //#include #include "cudart_utils.h" + #include -#include <raft/linalg/cublas_wrappers.h> -#include <raft/linalg/cusolver_wrappers.h> +#include <raft/linalg/detail/cublas_wrappers.hpp> +#include <raft/linalg/detail/cusolver_wrappers.hpp> #include #include #include diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index dce732b06b..fda4c02a1c 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh index a3f2411102..9cd5a29951 100644 --- a/cpp/include/raft/label/merge_labels.cuh +++ b/cpp/include/raft/label/merge_labels.cuh @@ -21,7 +21,7 @@ #include #include -#include +#include namespace raft { namespace label { diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.hpp similarity index 72% rename from cpp/include/raft/linalg/add.cuh rename to cpp/include/raft/linalg/add.hpp index 926cc44197..2f999a45d2 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,12 +16,13 @@ #pragma once -#include "binary_op.cuh" -#include "unary_op.cuh" +#include "detail/add.cuh" namespace raft { namespace linalg { +using detail::adds_scalar; + /** * @brief Elementwise scalar add operation on the input buffer * * @@ -39,8 +40,7 @@ namespace linalg { template void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) { - auto op = [scalar] __device__(InT in) { return OutT(in + scalar); }; - unaryOp(out, in, len, op, stream); + detail::addScalar(out, in, scalar, len, stream); } /** * @@ -59,18 +59,7 @@ void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t s template void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) { - auto op = [] __device__(InT a, InT b) { return OutT(a + b); }; - binaryOp(out, in1, in2, len, op, stream); -} - -template -__global__ void add_dev_scalar_kernel(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len) -{ - IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } + detail::add(out, in1, in2, len, stream); } /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and @@ -90,11 +79,7 @@ void addDevScalar(math_t* outDev, const math_t* inDev, const math_t* singleScalarDev, IdxType len, cudaStream_t stream) { - // TODO: block dimension has not been tuned - dim3 block(256); - dim3 grid(raft::ceildiv(len, (IdxType)block.x)); - add_dev_scalar_kernel<<>>(outDev, inDev, singleScalarDev, len); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/binary_op.hpp b/cpp/include/raft/linalg/binary_op.hpp new file mode 100644 index 0000000000..5c73b6d3c5 --- /dev/null +++ b/cpp/include/raft/linalg/binary_op.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/binary_op.cuh" + +#include + +namespace raft { +namespace linalg { + +/** + * @brief perform element-wise binary operation on the input arrays + * @tparam InType input data-type + * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType output data-type + * @tparam IdxType Integer type used for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in1 the first input array + * @param in2 the second input array + * @param len number of elements in the input array + * @param op the device-lambda + * @param stream cuda stream where to launch work + * @note Lambda must be a functor with the following signature: + * `OutType func(const InType& val1, const InType& val2);` + */ +template +void binaryOp( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + detail::binaryOp(out, in1, in2, len, op, stream); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.hpp similarity index 56% rename from cpp/include/raft/linalg/cholesky_r1_update.cuh rename to cpp/include/raft/linalg/cholesky_r1_update.hpp index 1745b0dcc8..583c65c50e 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,11 +16,7 @@ #pragma once -#include -#include -#include -#include -#include +#include "detail/cholesky_r1_update.hpp" namespace raft { namespace linalg { @@ -132,94 +128,7 @@ void choleskyRank1Update(const raft::handle_t& handle, cudaStream_t stream, math_t eps = -1) { - // The matrix A' is defined as: - // A' = [[A_11, A_12] - // [A_21, A_22]] - // where: - // - A_11 = A, matrix of size (n-1)x(n-1) - // - A_21[j] = A_12.T[j] = A_new[j] j=0..n-2, vector with (n-1) elements - // - A_22 = A_new[n-1] scalar. - // - // Instead of caclulating the Cholelsky decomposition of A' from scratch, - // we just update L with the new row. The new Cholesky decomposition will be - // calculated as: - // L' = [[L_11, 0] - // [L_12, L_22]] - // where L_11 is the Cholesky decomposition of A (size [n-1 x n-1]), and - // L_12 and L_22 are the new quantities that we need to calculate. - - // We need a workspace in device memory to store a scalar. Additionally, in - // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats. - const int align = 256; - int offset = - (uplo == CUBLAS_FILL_MODE_LOWER) ?
raft::alignTo(sizeof(math_t) * (n - 1), align) : 0; - if (workspace == nullptr) { - *n_bytes = offset + 1 * sizeof(math_t); - return; - } - math_t* s = reinterpret_cast(((char*)workspace) + offset); - math_t* L_22 = L + (n - 1) * ld + n - 1; - - math_t* A_new = nullptr; - math_t* A_row = nullptr; - if (uplo == CUBLAS_FILL_MODE_UPPER) { - // A_new is stored as the n-1 th column of L - A_new = L + (n - 1) * ld; - } else { - // If the input is lower triangular, then the new elements of A are stored - // as the n-th row of L. Since the matrix is column major, this is non - // contiguous. We copy elements from A_row to a contiguous workspace A_new. - A_row = L + n - 1; - A_new = reinterpret_cast(workspace); - RAFT_CUBLAS_TRY( - raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream)); - } - cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; - if (n > 1) { - // Calculate L_12 = x by solving equation L_11 x = A_12 - math_t alpha = 1; - RAFT_CUBLAS_TRY(raft::linalg::cublastrsm(handle.get_cublas_handle(), - CUBLAS_SIDE_LEFT, - uplo, - op, - CUBLAS_DIAG_NON_UNIT, - n - 1, - 1, - &alpha, - L, - ld, - A_new, - n - 1, - stream)); - - // A_new now stores L_12, we calculate s = L_12 * L_12 - RAFT_CUBLAS_TRY( - raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream)); - - if (uplo == CUBLAS_FILL_MODE_LOWER) { - // Copy back the L_12 elements as the n-th row of L - RAFT_CUBLAS_TRY( - raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream)); - } - } else { // n == 1 case - RAFT_CUDA_TRY(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); - } - - // L_22 = sqrt(A_22 - L_12 * L_12) - math_t s_host; - math_t L_22_host; - raft::update_host(&s_host, s, 1, stream); - raft::update_host(&L_22_host, L_22, 1, stream); // L_22 stores A_22 - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - L_22_host = std::sqrt(L_22_host - s_host); - - // Check for numeric error with sqrt. If the matrix is not positive definit or - // the system is very ill conditioned then the A_22 - L_12 * L_12 can be - // negative, which would result L_22 = NaN. A small positive eps parameter - // can be used to prevent this. - if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; } - ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update"); - raft::update_device(L_22, &L_22_host, 1, stream); + detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, stream, eps); } }; // namespace linalg }; // namespace raft diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp new file mode 100644 index 0000000000..0f1ca9202d --- /dev/null +++ b/cpp/include/raft/linalg/coalesced_reduction.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "detail/coalesced_reduction.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief Compute reduction of the input matrix along the leading dimension + * + * @tparam InType the data type of the input + * @tparam OutType the data type of the output (as well as the data type for + * which reduction is performed) + * @tparam IdxType data type of the indices of the array + * @tparam MainLambda Unary lambda applied during accumulation (eg: L1 or L2 norm) + * It must be a 'callable' supporting the following input and output: + * <pre>OutType (*MainLambda)(InType, IdxType);</pre>
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) + * It must be a 'callable' supporting the following input and output: + * <pre>OutType (*ReduceLambda)(OutType, OutType);</pre>
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) + * It must be a 'callable' supporting the following input and output: + * <pre>OutType (*FinalLambda)(OutType);</pre>
+ * @param dots the output reduction vector + * @param data the input matrix + * @param D leading dimension of data + * @param N second dimension of data + * @param init initial value to use for the reduction + * @param main_op elementwise operation to apply before reduction + * @param reduce_op binary reduction operation + * @param final_op elementwise operation to apply before storing results + * @param inplace reduction result added in place or overwrites old values? + * @param stream cuda stream where to launch work + */ +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void coalescedReduction(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/contractions.hpp b/cpp/include/raft/linalg/contractions.hpp new file mode 100644 index 0000000000..e317588b1d --- /dev/null +++ b/cpp/include/raft/linalg/contractions.hpp @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/contractions.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief This is the central policy that should be used to configure the perf + * landscape of the Contraction kernel. + * + * The main goal of this Policy struct is to provide sufficient knobs to tune the + * perf of the Contraction kernel, as and when we see matrices of different shapes. + * + * @tparam DataT the IO and math datatype + * @tparam _veclen number of k-elements loaded by each thread for every LDG call + * it makes. This should be configured based on the input 'k' + * value and the input data type. For eg: if DataT = float and + * k is a multiple of 4, then setting this to 4 gives the best + * LDG pattern. Possible values are {1, 2, 4}. + * @tparam _kblk number of k-elements operated upon per main-loop iteration. + * Therefore total number of main-loop iterations will be + * `ceil(k/_kblk)`. This must be a multiple of `_veclen`. Do note + * that the bigger this value, the greater the shared mem requirement. + * @tparam _rpt Defines the number of rows that a given thread accumulates on. + * This directly results in increased register pressure. This + * also is used to compute the number of m-elements worked upon + * by each thread block. + * @tparam _cpt Defines the number of cols that a given thread accumulates on. + * This directly results in increased register pressure. This + * also is used to compute the number of n-elements worked upon + * by each thread block. + * @tparam _tr Number of threads working on the same output column. This is + * used to compute the number of m-elements worked upon by each + * thread block.
This also determines the number of threads per + * thread block + * @tparam _tc Number of threads working on the same output row. This is + * used to compute the number of m-elements worked upon by each + * thread block. This also determines the number of threads per + * thread block + */ +template +struct KernelPolicy { + enum { + /** number of elements along K worked upon per main loop iteration */ + Kblk = _kblk, + /** number of elements loaded per LDG */ + Veclen = _veclen, + /** number of rows a thread works on for accumulation */ + AccRowsPerTh = _rpt, + /** number of cols a thread works on for accumulation */ + AccColsPerTh = _cpt, + /** number of threads working the same output col */ + AccThRows = _tr, + /** number of threads working the same output row */ + AccThCols = _tc, + /** total threads per block */ + Nthreads = AccThRows * AccThCols, + /** output tile size along rows */ + Mblk = AccRowsPerTh * AccThRows, + /** output tile size along cols */ + Nblk = AccColsPerTh * AccThCols, + /** number of threads loading a single row */ + LdgThRow = Kblk / Veclen, + /** number of LDGs issued by a single thread for X */ + LdgPerThX = Mblk * LdgThRow / Nthreads, + /** number of LDGs issued by a single thread for Y */ + LdgPerThY = Nblk * LdgThRow / Nthreads, + /** number of rows of X covered per LDG */ + LdgRowsX = Mblk / LdgPerThX, + /** number of rows of Y covered per LDG */ + LdgRowsY = Nblk / LdgPerThY, + /** stride for accessing X/Y data in shared mem */ + SmemStride = Kblk + Veclen, + /** size of one page for storing X data */ + SmemPageX = SmemStride * Mblk, + /** size of one page for storing Y data */ + SmemPageY = SmemStride * Nblk, + /** size of one smem page */ + SmemPage = SmemPageX + SmemPageY, + /** size (in B) for smem needed */ + SmemSize = 2 * SmemPage * sizeof(DataT), + }; // enum + +}; // struct KernelPolicy + +template +struct ColKernelPolicy { + enum { + /** number of elements along K worked upon per main loop iteration */ + Kblk = _kblk, + /** number of elements loaded per LDG */ + Veclen = _veclen, + /** number of rows a thread works on for accumulation */ + AccRowsPerTh = _rpt, + /** number of cols a thread works on for accumulation */ + AccColsPerTh = _cpt, + /** number of threads working the same output col */ + AccThRows = _tr, + /** number of threads working the same output row */ + AccThCols = _tc, + /** total threads per block */ + Nthreads = AccThRows * AccThCols, + /** output tile size along rows */ + Mblk = AccRowsPerTh * AccThRows, + /** output tile size along cols */ + Nblk = AccColsPerTh * AccThCols, + /** number of threads loading a single col */ + LdgThRow = Mblk / Veclen, + /** number of LDGs issued by a single thread for X */ + LdgPerThX = Kblk * LdgThRow / Nthreads, + /** number of LDGs issued by a single thread for Y */ + LdgPerThY = Kblk * LdgThRow / Nthreads, + /** number of rows of X covered per LDG */ + LdgRowsX = Kblk / LdgPerThX, + /** number of rows of Y covered per LDG */ + LdgRowsY = Kblk / LdgPerThY, + /** stride for accessing X/Y data in shared mem */ + SmemStride = Mblk + Veclen, + /** size of one page for storing X data */ + SmemPageX = SmemStride * Kblk, + /** size of one page for storing Y data */ + SmemPageY = SmemStride * Kblk, + /** size of one smem page */ + SmemPage = SmemPageX + SmemPageY, + /** size (in B) for smem needed */ + SmemSize = 2 * SmemPage * sizeof(DataT), + }; // colMajor enum + static_assert(Mblk == Nblk, "Mblk should be equal to Nblk"); +}; +/** + * @defgroup Policy4x4 16 elements per thread Policy with k-block 
= 32 + * @{ + */ +template +struct Policy4x4 { +}; + +template +struct Policy4x4 { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; + +template +struct Policy4x4 { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; +/** @} */ + +/** + * @defgroup Policy2x8 16 elements per thread Policy with k-block = 16 + * @{ + */ +template +struct Policy2x8 { +}; + +template +struct Policy2x8 { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; + +template +struct Policy2x8 { + // this is not used, just to keep the compiler happy. + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; +/** @} */ + +/** + * @brief Base class for gemm-like NT contractions + * + * This class does not provide any arithmetic operations, but only provides the + * memory-related operations of loading the `x` and `y` matrix blocks from the + * global memory into shared memory and then from shared into registers. Thus, + * this class acts as a basic building block for further composing gemm-like NT + * contractions on input matrices which are row-major (as is the output) + * + * @tparam DataT IO and math data type + * @tparam IdxT indexing type + * @tparam Policy policy used to customize memory access behavior. + * See documentation for `KernelPolicy` to know more. + */ +using detail::Contractions_NT; + +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh new file mode 100644 index 0000000000..794a776dcf --- /dev/null +++ b/cpp/include/raft/linalg/detail/add.cuh @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include "functional.cuh" + +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp(out, in, len, adds_scalar(scalar), stream); +} + +template +void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::plus(), stream); +} + +template +__global__ void add_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) +{ + IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; + if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } +} + +template +void addDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ + // TODO: block dimension has not been tuned + dim3 block(256); + dim3 grid(raft::ceildiv(len, (IdxType)block.x)); + add_dev_scalar_kernel<<>>(outDev, inDev, singleScalarDev, len); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh similarity index 81% rename from cpp/include/raft/linalg/binary_op.cuh rename to cpp/include/raft/linalg/detail/binary_op.cuh index 00a2af0014..6b1f8bc6d7 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/detail/binary_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,11 +16,11 @@ #pragma once -#include #include namespace raft { namespace linalg { +namespace detail { template __global__ void binaryOpKernel( @@ -60,22 +60,6 @@ inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint6 return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; } -/** - * @brief perform element-wise binary operation on the input arrays - * @tparam InType input data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam OutType output data-type - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in1 the first input array - * @param in2 the second input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work - * @note Lambda must be a functor with the following signature: - * `OutType func(const InType& val1, const InType& val2);` - */ template +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void choleskyRank1Update(const raft::handle_t& handle, + math_t* L, + int n, + int ld, + void* workspace, + int* n_bytes, + cublasFillMode_t uplo, + cudaStream_t stream, + math_t eps = -1) +{ + // The matrix A' is defined as: + // A' = [[A_11, A_12] + // [A_21, A_22]] + // where: + // - A_11 = A, matrix of size (n-1)x(n-1) + // - A_21[j] = A_12.T[j] = A_new[j] j=0..n-2, vector with (n-1) elements + // - A_22 = A_new[n-1] scalar. + // + // Instead of calculating the Cholesky decomposition of A' from scratch, + // we just update L with the new row.
The new Cholesky decomposition will be + // calculated as: + // L' = [[L_11, 0] + // [L_12, L_22]] + // where L_11 is the Cholesky decomposition of A (size [n-1 x n-1]), and + // L_12 and L_22 are the new quantities that we need to calculate. + + // We need a workspace in device memory to store a scalar. Additionally, in + // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats. + const int align = 256; + int offset = + (uplo == CUBLAS_FILL_MODE_LOWER) ? raft::alignTo(sizeof(math_t) * (n - 1), align) : 0; + if (workspace == nullptr) { + *n_bytes = offset + 1 * sizeof(math_t); + return; + } + math_t* s = reinterpret_cast(((char*)workspace) + offset); + math_t* L_22 = L + (n - 1) * ld + n - 1; + + math_t* A_new = nullptr; + math_t* A_row = nullptr; + if (uplo == CUBLAS_FILL_MODE_UPPER) { + // A_new is stored as the n-1 th column of L + A_new = L + (n - 1) * ld; + } else { + // If the input is lower triangular, then the new elements of A are stored + // as the n-th row of L. Since the matrix is column major, this is non + // contiguous. We copy elements from A_row to a contiguous workspace A_new. + A_row = L + n - 1; + A_new = reinterpret_cast(workspace); + RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream)); + } + cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; + if (n > 1) { + // Calculate L_12 = x by solving equation L_11 x = A_12 + math_t alpha = 1; + RAFT_CUBLAS_TRY(cublastrsm(handle.get_cublas_handle(), + CUBLAS_SIDE_LEFT, + uplo, + op, + CUBLAS_DIAG_NON_UNIT, + n - 1, + 1, + &alpha, + L, + ld, + A_new, + n - 1, + stream)); + + // A_new now stores L_12, we calculate s = L_12 * L_12 + RAFT_CUBLAS_TRY(cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream)); + + if (uplo == CUBLAS_FILL_MODE_LOWER) { + // Copy back the L_12 elements as the n-th row of L + RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream)); + } + } else { // n == 1 case + RAFT_CUDA_TRY(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); + } + + // L_22 = sqrt(A_22 - L_12 * L_12) + math_t s_host; + math_t L_22_host; + raft::update_host(&s_host, s, 1, stream); + raft::update_host(&L_22_host, L_22, 1, stream); // L_22 stores A_22 + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + L_22_host = std::sqrt(L_22_host - s_host); + + // Check for numeric error with sqrt. If the matrix is not positive definite or + // the system is very ill conditioned then A_22 - L_12 * L_12 can be + // negative, which would result in L_22 = NaN. A small positive eps parameter + // can be used to prevent this. + if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; } + ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update"); + raft::update_device(L_22, &L_22_host, 1, stream); +} + +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh similarity index 71% rename from cpp/include/raft/linalg/coalesced_reduction.cuh rename to cpp/include/raft/linalg/detail/coalesced_reduction.cuh index 717e2c42b2..7e545e4932 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION.
 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ namespace raft { namespace linalg { +namespace detail { // Kernel (based on norm.cuh) to perform reductions along the coalesced dimension // of the matrix, i.e. reduce along rows for row major or reduce along columns @@ -61,33 +62,6 @@ __global__ void coalescedReductionKernel(OutType* dots, } } -/** - * @brief Compute reduction of the input matrix along the leading dimension - * - * @tparam InType the data type of the input - * @tparam OutType the data type of the output (as well as the data type for - * which reduction is performed) - * @tparam IdxType data type of the indices of the array - * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) - * It must be a 'callable' supporting the following input and output: - * <pre>OutType (*MainLambda)(InType, IdxType);</pre>
- * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) - * It must be a 'callable' supporting the following input and output: - * <pre>OutType (*ReduceLambda)(OutType);</pre>
- * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) - * It must be a 'callable' supporting the following input and output: - * <pre>OutType (*FinalLambda)(OutType);</pre>
- * @param dots the output reduction vector - * @param data the input matrix - * @param D leading dimension of data - * @param N second dimension data - * @param init initial value to use for the reduction - * @param main_op elementwise operation to apply before reduction - * @param reduce_op binary reduction operation - * @param final_op elementwise operation to apply before storing results - * @param inplace reduction result added inplace or overwrites old values? - * @param stream cuda stream where to launch work - */ template -struct KernelPolicy { - enum { - /** number of elements along K worked upon per main loop iteration */ - Kblk = _kblk, - /** number of elements loaded per LDG */ - Veclen = _veclen, - /** number of rows a thread works on for accumulation */ - AccRowsPerTh = _rpt, - /** number of cols a thread works on for accumulation */ - AccColsPerTh = _cpt, - /** number of threads working the same output col */ - AccThRows = _tr, - /** number of threads working the same output row */ - AccThCols = _tc, - /** total threads per block */ - Nthreads = AccThRows * AccThCols, - /** output tile size along rows */ - Mblk = AccRowsPerTh * AccThRows, - /** output tile size along cols */ - Nblk = AccColsPerTh * AccThCols, - /** number of threads loading a single row */ - LdgThRow = Kblk / Veclen, - /** number of LDGs issued by a single thread for X */ - LdgPerThX = Mblk * LdgThRow / Nthreads, - /** number of LDGs issued by a single thread for Y */ - LdgPerThY = Nblk * LdgThRow / Nthreads, - /** number of rows of X covered per LDG */ - LdgRowsX = Mblk / LdgPerThX, - /** number of rows of Y covered per LDG */ - LdgRowsY = Nblk / LdgPerThY, - /** stride for accessing X/Y data in shared mem */ - SmemStride = Kblk + Veclen, - /** size of one page for storing X data */ - SmemPageX = SmemStride * Mblk, - /** size of one page for storing Y data */ - SmemPageY = SmemStride * Nblk, - /** size of one smem page */ - SmemPage = SmemPageX + SmemPageY, - /** size (in B) for smem needed */ - SmemSize = 2 * SmemPage * sizeof(DataT), - }; // enum - -}; // struct KernelPolicy - -template -struct ColKernelPolicy { - enum { - /** number of elements along K worked upon per main loop iteration */ - Kblk = _kblk, - /** number of elements loaded per LDG */ - Veclen = _veclen, - /** number of rows a thread works on for accumulation */ - AccRowsPerTh = _rpt, - /** number of cols a thread works on for accumulation */ - AccColsPerTh = _cpt, - /** number of threads working the same output col */ - AccThRows = _tr, - /** number of threads working the same output row */ - AccThCols = _tc, - /** total threads per block */ - Nthreads = AccThRows * AccThCols, - /** output tile size along rows */ - Mblk = AccRowsPerTh * AccThRows, - /** output tile size along cols */ - Nblk = AccColsPerTh * AccThCols, - /** number of threads loading a single col */ - LdgThRow = Mblk / Veclen, - /** number of LDGs issued by a single thread for X */ - LdgPerThX = Kblk * LdgThRow / Nthreads, - /** number of LDGs issued by a single thread for Y */ - LdgPerThY = Kblk * LdgThRow / Nthreads, - /** number of rows of X covered per LDG */ - LdgRowsX = Kblk / LdgPerThX, - /** number of rows of Y covered per LDG */ - LdgRowsY = Kblk / LdgPerThY, - /** stride for accessing X/Y data in shared mem */ - SmemStride = Mblk + Veclen, - /** size of one page for storing X data */ - SmemPageX = SmemStride * Kblk, - /** size of one page for storing Y data */ - SmemPageY = SmemStride * Kblk, - /** size of one smem page */ - SmemPage = SmemPageX + SmemPageY, - /** 
size (in B) for smem needed */ - SmemSize = 2 * SmemPage * sizeof(DataT), - }; // colMajor enum - static_assert(Mblk == Nblk, "Mblk should be equal to Nblk"); -}; -/** - * @defgroup Policy4x4 16 elements per thread Policy with k-block = 32 - * @{ - */ -template -struct Policy4x4 { -}; - -template -struct Policy4x4 { - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; - -template -struct Policy4x4 { - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; -/** @} */ - -/** - * @defgroup Policy2x8 16 elements per thread Policy with k-block = 16 - * @{ - */ -template -struct Policy2x8 { -}; - -template -struct Policy2x8 { - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; - -template -struct Policy2x8 { - // this is not used just for keeping compiler happy. - typedef KernelPolicy Policy; - typedef ColKernelPolicy ColPolicy; -}; -/** @} */ - -/** - * @brief Base class for gemm-like NT contractions - * - * This class does not provide any arithmetic operations, but only provides the - * memory-related operations of loading the `x` and `y` matrix blocks from the - * global memory into shared memory and then from shared into registers. Thus, - * this class acts as a basic building block for further composing gemm-like NT - * contractions on input matrices which are row-major (and so does the output) - * - * @tparam DataT IO and math data type - * @tparam IdxT indexing type - * @tparam Policy policy used to customize memory access behavior. - * See documentation for `KernelPolicy` to know more. - */ template struct Contractions_NT { protected: @@ -492,5 +313,6 @@ struct Contractions_NT { }; // struct Contractions_NT +} // namespace detail } // namespace linalg -} // namespace raft +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp similarity index 94% rename from cpp/include/raft/linalg/cublas_wrappers.h rename to cpp/include/raft/linalg/detail/cublas_wrappers.hpp index 246e6466d8..752235d246 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
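The wrapper hunks that follow do two things: they replace the older `CUBLAS_CHECK` macro with `RAFT_CUBLAS_TRY` around every `cublasSetStream` call, and they move the overloads into `raft::linalg::detail`. A hedged caller-side sketch of the resulting pattern; the float `cublasaxpy` overload and `handle.get_cublas_handle()` both appear in this diff, while the exact argument order shown here follows standard BLAS `axpy` and is an assumption:

```cpp
#include <raft/handle.hpp>
#include <raft/linalg/detail/cublas_wrappers.hpp>

// Computes y = alpha * x + y on the handle's cuBLAS handle.
void axpy_f32(const raft::handle_t& handle, int n, const float* x, float* y, cudaStream_t stream)
{
  const float alpha = 1.0f;
  // The detail wrapper binds the stream to the cuBLAS handle and forwards to
  // cublasSaxpy; RAFT_CUBLAS_TRY raises on any non-success status.
  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasaxpy(
    handle.get_cublas_handle(), n, &alpha, x, 1, y, 1, stream));
}
```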
@@ -117,6 +117,7 @@ inline const char* cublas_error_to_string(cublasStatus_t err) namespace raft { namespace linalg { +namespace detail { /** * Assuming the default CUBLAS_POINTER_MODE_HOST, change it to host or device mode @@ -171,7 +172,7 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSaxpy(handle, n, alpha, x, incx, y, incy); } @@ -185,7 +186,7 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDaxpy(handle, n, alpha, x, incx, y, incy); } /** @} */ @@ -202,7 +203,7 @@ template <> inline cublasStatus_t cublasSwap( cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSswap(handle, n, x, incx, y, incy); } @@ -210,7 +211,7 @@ template <> inline cublasStatus_t cublasSwap( cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDswap(handle, n, x, incx, y, incy); } @@ -228,14 +229,14 @@ template <> inline cublasStatus_t cublasCopy( cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasScopy(handle, n, x, incx, y, incy); } template <> inline cublasStatus_t cublasCopy( cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDcopy(handle, n, x, incx, y, incy); } /** @} */ @@ -274,7 +275,7 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } @@ -293,7 +294,7 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle, int incy, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } /** @} */ @@ -327,7 +328,7 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, int lda, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda); } @@ -344,7 +345,7 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, int lda, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda); } /** @} */ @@ -387,7 +388,7 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } @@ -408,7 +409,7 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle, 
int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -454,7 +455,7 @@ inline cublasStatus_t cublasgemmBatched( // NOLINT int batchCount, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgemmBatched(handle, transa, transb, @@ -491,7 +492,7 @@ inline cublasStatus_t cublasgemmBatched( // NOLINT int batchCount, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgemmBatched(handle, transa, transb, @@ -558,7 +559,7 @@ inline cublasStatus_t cublasgemmStridedBatched( // NOLINT int batchCount, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgemmStridedBatched(handle, transa, transb, @@ -601,7 +602,7 @@ inline cublasStatus_t cublasgemmStridedBatched( // NOLINT int batchCount, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgemmStridedBatched(handle, transa, transb, @@ -648,7 +649,7 @@ inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize); } @@ -662,7 +663,7 @@ inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize); } @@ -691,7 +692,7 @@ inline cublasStatus_t cublasgetriBatched( // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } @@ -708,7 +709,7 @@ inline cublasStatus_t cublasgetriBatched( // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } @@ -749,7 +750,7 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgelsBatched( handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } @@ -769,7 +770,7 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT int batchSize, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgelsBatched( handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } @@ -812,7 +813,7 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } @@ -832,7 +833,7 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle, int 
ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } /** @} */ @@ -873,7 +874,7 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } @@ -893,7 +894,7 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -930,7 +931,7 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } @@ -948,7 +949,7 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, int ldc, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } /** @} */ @@ -965,7 +966,7 @@ template <> inline cublasStatus_t cublasnrm2( cublasHandle_t handle, int n, const float* x, int incx, float* result, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSnrm2(handle, n, x, incx, result); } @@ -973,7 +974,7 @@ template <> inline cublasStatus_t cublasnrm2( cublasHandle_t handle, int n, const double* x, int incx, double* result, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDnrm2(handle, n, x, incx, result); } /** @} */ @@ -1008,7 +1009,7 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle, int ldb, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } @@ -1027,7 +1028,7 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle, int ldb, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } @@ -1055,7 +1056,7 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, float* result, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSdot(handle, n, x, incx, y, incy, result); } @@ -1069,7 +1070,7 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, double* result, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDdot(handle, n, x, incx, y, incy, result); } /** @} */ @@ -1090,7 +1091,7 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, cublasPointerMode_t mode, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSetPointerMode(handle, mode); } /** @} */ @@ -1107,7 +1108,7 @@ template <> inline cublasStatus_t cublasscal( 
cublasHandle_t handle, int n, const float* alpha, float* x, int incx, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasSscal(handle, n, alpha, x, incx); } @@ -1115,11 +1116,12 @@ template <> inline cublasStatus_t cublasscal( cublasHandle_t handle, int n, const double* alpha, double* x, int incx, cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle, stream)); + RAFT_CUBLAS_TRY(cublasSetStream(handle, stream)); return cublasDscal(handle, n, alpha, x, incx); } /** @} */ +} // namespace detail } // namespace linalg } // namespace raft diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp similarity index 95% rename from cpp/include/raft/linalg/cusolver_wrappers.h rename to cpp/include/raft/linalg/detail/cusolver_wrappers.hpp index 988e7512d5..34ec6cb673 100644 --- a/cpp/include/raft/linalg/cusolver_wrappers.h +++ b/cpp/include/raft/linalg/detail/cusolver_wrappers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -115,6 +115,7 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err) namespace raft { namespace linalg { +namespace detail { /** * @defgroup Getrf cusolver getrf operations @@ -142,7 +143,7 @@ inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } @@ -157,7 +158,7 @@ inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } @@ -224,7 +225,7 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } @@ -241,7 +242,7 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } /** @} */ @@ -323,7 +324,7 @@ inline cusolverStatus_t cusolverDnsyevj( // NOLINT syevjInfo_t params, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } @@ -342,7 +343,7 @@ inline cusolverStatus_t cusolverDnsyevj( // NOLINT syevjInfo_t params, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } @@ -419,7 +420,7 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { 
- CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } @@ -436,12 +437,11 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } /** @} */ -#if CUDART_VERSION >= 10010 /** * @defgroup syevdx cusolver syevdx operations * @{ @@ -545,7 +545,7 @@ inline cusolverStatus_t cusolverDnsyevdx( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSsyevdx( handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } @@ -570,12 +570,11 @@ inline cusolverStatus_t cusolverDnsyevdx( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDsyevdx( handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } /** @} */ -#endif /** * @defgroup svd cusolver svd operations @@ -633,7 +632,7 @@ inline cusolverStatus_t cusolverDngesvd( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgesvd( handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } @@ -657,7 +656,7 @@ inline cusolverStatus_t cusolverDngesvd( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgesvd( handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } @@ -757,7 +756,7 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT gesvdjInfo_t params, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgesvdj( handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } @@ -781,7 +780,7 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT gesvdjInfo_t params, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgesvdj( handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } @@ -846,7 +845,7 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } @@ -861,7 +860,7 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } /** @} */ @@ -894,7 +893,7 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - 
CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } @@ -910,7 +909,7 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } /** @} */ @@ -942,7 +941,7 @@ inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } template <> @@ -957,7 +956,7 @@ inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } @@ -1024,7 +1023,7 @@ inline cusolverStatus_t cusolverDnorgqr( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } template <> @@ -1041,7 +1040,7 @@ inline cusolverStatus_t cusolverDnorgqr( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } @@ -1122,7 +1121,7 @@ inline cusolverStatus_t cusolverDnormqr( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } @@ -1144,7 +1143,7 @@ inline cusolverStatus_t cusolverDnormqr( // NOLINT int* devInfo, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } @@ -1311,7 +1310,7 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT void* pBuffer, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverSpSetStream(handle, stream)); return cusolverSpScsrqrsvBatched( handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } @@ -1333,7 +1332,7 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT void* pBuffer, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverSpSetStream(handle, stream)); return cusolverSpDcsrqrsvBatched( handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } @@ -1372,7 +1371,7 @@ inline cusolverStatus_t cusolverDnxsyevd_bufferSize( // NOLINT size_t* workspaceInBytesOnHost, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd_bufferSize(handle, params, jobz, @@ -1402,7 +1401,7 @@ inline cusolverStatus_t 
cusolverDnxsyevd_bufferSize( // NOLINT size_t* workspaceInBytesOnHost, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd_bufferSize(handle, params, jobz, @@ -1452,7 +1451,7 @@ inline cusolverStatus_t cusolverDnxsyevd( // NOLINT int* info, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd(handle, params, jobz, @@ -1488,7 +1487,7 @@ inline cusolverStatus_t cusolverDnxsyevd( // NOLINT int* info, cudaStream_t stream) { - CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + RAFT_CUSOLVER_TRY(cusolverDnSetStream(handle, stream)); return cusolverDnXsyevd(handle, params, jobz, @@ -1509,5 +1508,6 @@ inline cusolverStatus_t cusolverDnxsyevd( // NOLINT /** @} */ #endif +} // namespace detail } // namespace linalg } // namespace raft diff --git a/cpp/include/raft/linalg/detail/divide.hpp b/cpp/include/raft/linalg/detail/divide.hpp new file mode 100644 index 0000000000..c694529fb5 --- /dev/null +++ b/cpp/include/raft/linalg/detail/divide.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "functional.cuh" +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp(out, in, len, divides_scalar(scalar), stream); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/detail/eig.hpp similarity index 86% rename from cpp/include/raft/linalg/eig.cuh rename to cpp/include/raft/linalg/detail/eig.hpp index 200f69a88a..8716b4de29 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,17 +16,18 @@ #pragma once +#include "cusolver_wrappers.hpp" #include #include #include #include -#include #include #include #include namespace raft { namespace linalg { +namespace detail { template void eigDC_legacy(const raft::handle_t& handle, @@ -73,19 +74,6 @@ void eigDC_legacy(const raft::handle_t& handle, "This usually occurs when some of the features do not vary enough."); } -/** - * @defgroup eig decomp with divide and conquer method for the column-major - * symmetric matrices - * @param handle raft handle - * @param in the input buffer (symmetric matrix that has real eig values and - * vectors. 
- * @param n_rows: number of rows of the input - * @param n_cols: number of cols of the input - * @param eig_vectors: eigenvectors - * @param eig_vals: eigen values - * @param stream cuda stream - * @{ - */ template void eigDC(const raft::handle_t& handle, const math_t* in, @@ -149,22 +137,6 @@ void eigDC(const raft::handle_t& handle, enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; -#if CUDART_VERSION >= 10010 - -/** - * @defgroup eig decomp with divide and conquer method for the column-major - * symmetric matrices - * @param handle raft handle - * @param in the input buffer (symmetric matrix that has real eig values and - * vectors. - * @param n_rows: number of rows of the input - * @param n_cols: number of cols of the input - * @param n_eig_vals: number of eigenvectors to be generated - * @param eig_vectors: eigenvectors - * @param eig_vals: eigen values - * @param stream cuda stream - * @{ - */ template void eigSelDC(const raft::handle_t& handle, math_t* in, @@ -256,39 +228,23 @@ void eigSelDC(const raft::handle_t& handle, } } -#endif - -/** - * @defgroup overloaded function for eig decomp with Jacobi method for the - * column-major symmetric matrices (in parameter) - * @param handle: raft handle - * @param n_rows: number of rows of the input - * @param n_cols: number of cols of the input - * @param eig_vectors: eigenvectors - * @param eig_vals: eigen values - * @param tol: error tolerance for the jacobi method. Algorithm stops when the - * error is below tol - * @param sweeps: number of sweeps in the Jacobi algorithm. The more the better - * accuracy. - * @{ - */ template void eigJacobi(const raft::handle_t& handle, const math_t* in, - std::size_t n_rows, - std::size_t n_cols, + int n_rows, + int n_cols, math_t* eig_vectors, math_t* eig_vals, cudaStream_t stream, - math_t tol = 1.e-7, - std::uint32_t sweeps = 15) + math_t tol = 1.e-7, + int sweeps = 15) { cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); syevjInfo_t syevj_params = nullptr; RAFT_CUSOLVER_TRY(cusolverDnCreateSyevjInfo(&syevj_params)); RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetTolerance(syevj_params, tol)); - RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetMaxSweeps(syevj_params, static_cast(sweeps))); + RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps)); int lwork; RAFT_CUSOLVER_TRY(cusolverDnsyevj_bufferSize(cusolverH, @@ -326,5 +282,6 @@ void eigJacobi(const raft::handle_t& handle, RAFT_CUSOLVER_TRY(cusolverDnDestroySyevjInfo(syevj_params)); } -}; // end namespace linalg -}; // end namespace raft +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/eltwise.hpp b/cpp/include/raft/linalg/detail/eltwise.hpp new file mode 100644 index 0000000000..b15717f205 --- /dev/null +++ b/cpp/include/raft/linalg/detail/eltwise.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
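The eigensolver entry points above now live under detail/, with eigJacobi taking plain int dimensions and an int sweep count. A minimal caller sketch follows; the include path mirrors the rename in this diff, while the buffer names and the float instantiation are illustrative assumptions, not part of the patch:

```cpp
// Hedged sketch: exercises detail::eigJacobi as declared in the diff above.
// The handle and device buffers (d_A, d_eig_vectors, d_eig_vals) are assumed
// to be allocated and populated by the caller.
#include <raft/handle.hpp>
#include <raft/linalg/detail/eig.hpp>  // path introduced by this diff

void jacobi_eig_sketch(const raft::handle_t& handle,
                       const float* d_A,  // n x n symmetric, column-major
                       int n,
                       float* d_eig_vectors,  // n x n eigenvector output
                       float* d_eig_vals,     // n eigenvalue output
                       cudaStream_t stream)
{
  // tol / sweeps mirror the new defaults: the Jacobi solver stops once the
  // off-diagonal error falls below tol, or after `sweeps` full sweeps.
  raft::linalg::detail::eigJacobi(
    handle, d_A, n, n, d_eig_vectors, d_eig_vals, stream, 1.e-7f, 15);
}
```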
+ */ + +#pragma once + +#include "functional.cuh" + +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp(out, in, len, adds_scalar(scalar), stream); +} + +template +void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp(out, in, len, multiplies_scalar(scalar), stream); +} + +template +void eltwiseAdd( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::plus(), stream); +} + +template +void eltwiseSub( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::minus(), stream); +} + +template +void eltwiseMultiply( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::multiplies(), stream); +} + +template +void eltwiseDivide( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, thrust::divides(), stream); +} + +template +void eltwiseDivideCheckZero( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp(out, in1, in2, len, divides_check_zero(), stream); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/functional.cuh b/cpp/include/raft/linalg/detail/functional.cuh new file mode 100644 index 0000000000..067b1565e0 --- /dev/null +++ b/cpp/include/raft/linalg/detail/functional.cuh @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +struct divides_scalar { + public: + divides_scalar(ArgType scalar) : scalar_(scalar) {} + + __host__ __device__ inline ReturnType operator()(ArgType in) { return in / scalar_; } + + private: + ArgType scalar_; +}; + +template +struct adds_scalar { + public: + adds_scalar(ArgType scalar) : scalar_(scalar) {} + + __host__ __device__ inline ReturnType operator()(ArgType in) { return in + scalar_; } + + private: + ArgType scalar_; +}; + +template +struct multiplies_scalar { + public: + multiplies_scalar(ArgType scalar) : scalar_(scalar) {} + + __host__ __device__ inline ReturnType operator()(ArgType in) { return in * scalar_; } + + private: + ArgType scalar_; +}; + +template +struct divides_check_zero { + public: + __host__ __device__ inline ReturnType operator()(ArgType a, ArgType b) + { + return (b == static_cast(0)) ? 
0.0 : a / b; + } +}; + +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/detail/gemm.hpp similarity index 88% rename from cpp/include/raft/linalg/gemm.cuh rename to cpp/include/raft/linalg/detail/gemm.hpp index b5147915ba..8a02b702e5 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,13 +16,14 @@ #pragma once +#include "cublas_wrappers.hpp" #include #include #include -#include namespace raft { namespace linalg { +namespace detail { /** * @brief the wrapper of cublas gemm function @@ -146,25 +147,6 @@ void gemm(const raft::handle_t& handle, handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); } -/** - * @brief A wrapper for CUBLS GEMM function designed for handling all possible - * combinations of operand layouts. - * It computes the following equation: Z = alpha . X * Y + beta . Z - * @tparam T Data type of input/output matrices (float/double) - * @param handle raft handle - * @param z output matrix of size M rows x N columns - * @param x input matrix of size M rows x K columns - * @param y input matrix of size K rows x N columns - * @param _M number of rows of X and Z - * @param _N number of rows of Y and columns of Z - * @param _K number of columns of X and rows of Y - * @param isZColMajor Storage layout of Z. true = col major, false = row major - * @param isXColMajor Storage layout of X. true = col major, false = row major - * @param isYColMajor Storage layout of Y. true = col major, false = row major - * @param stream cuda stream - * @param alpha scalar - * @param beta scalar - */ template void gemm(const raft::handle_t& handle, T* z, @@ -253,5 +235,6 @@ void gemm(const raft::handle_t& handle, cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); } -} // end namespace linalg -} // end namespace raft +} // namespace detail +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/detail/gemv.hpp b/cpp/include/raft/linalg/detail/gemv.hpp new file mode 100644 index 0000000000..991268cf26 --- /dev/null +++ b/cpp/include/raft/linalg/detail/gemv.hpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
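The functors in detail/functional.cuh are plain `__host__ __device__` callables, and the scalar/eltwise routines above are one-line compositions of such a functor with the unaryOp/binaryOp primitives. The self-contained CUDA sketch below reproduces that pattern with stand-in names; it is an illustration of the design, not the RAFT headers themselves:

```cpp
// Standalone sketch of the functor + unaryOp composition used by the detail
// headers above. Compile with nvcc; names mirror the diff but are local.
#include <cuda_runtime.h>
#include <cstdio>

template <typename ReturnType, typename ArgType>
struct adds_scalar {
  adds_scalar(ArgType scalar) : scalar_(scalar) {}
  __host__ __device__ ReturnType operator()(ArgType in) const { return in + scalar_; }
 private:
  ArgType scalar_;
};

// A minimal unaryOp: one thread per element, functor applied elementwise.
template <typename InT, typename OutT, typename Lambda>
__global__ void unary_op_kernel(OutT* out, const InT* in, int len, Lambda op)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) { out[i] = op(in[i]); }
}

int main()
{
  const int len = 8;
  float h_in[len] = {0, 1, 2, 3, 4, 5, 6, 7}, h_out[len];
  float *d_in, *d_out;
  cudaMalloc(&d_in, len * sizeof(float));
  cudaMalloc(&d_out, len * sizeof(float));
  cudaMemcpy(d_in, h_in, len * sizeof(float), cudaMemcpyHostToDevice);

  // scalarAdd(out, in, 10.f, len, stream) reduces to exactly this launch.
  unary_op_kernel<<<1, 32>>>(d_out, d_in, len, adds_scalar<float, float>(10.f));
  cudaMemcpy(h_out, d_out, len * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < len; ++i) printf("%g ", h_out[i]);  // 10 11 ... 17
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
```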
+ */ + +#pragma once + +#include + +#include "cublas_wrappers.hpp" + +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows, + const int n_cols, + const math_t* x, + const int incx, + math_t* y, + const int incy, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ + cublasHandle_t cublas_h = handle.get_cublas_handle(); + cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + RAFT_CUBLAS_TRY( + cublasgemv(cublas_h, op_a, n_rows, n_cols, &alpha, A, n_rows, x, incx, &beta, y, incy, stream)); +} + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const math_t* x, + math_t* y, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ + gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); +} + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const math_t* x, + math_t* y, + const bool trans_a, + cudaStream_t stream) +{ + math_t alpha = math_t(1); + math_t beta = math_t(0); + + gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); +} + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const int lda, + const math_t* x, + math_t* y, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ + cublasHandle_t cublas_h = handle.get_cublas_handle(); + cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + RAFT_CUBLAS_TRY( + cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, 1, &beta, y, 1, stream)); +} + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const int lda, + const math_t* x, + math_t* y, + const bool trans_a, + cudaStream_t stream) +{ + math_t alpha = math_t(1); + math_t beta = math_t(0); + gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); +} + +}; // namespace detail +}; // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/detail/init.hpp similarity index 81% rename from cpp/include/raft/linalg/init.h rename to cpp/include/raft/linalg/detail/init.hpp index 03d4f99e90..4718a2cb0e 100644 --- a/cpp/include/raft/linalg/init.h +++ b/cpp/include/raft/linalg/detail/init.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,19 +23,8 @@ namespace raft { namespace linalg { +namespace detail { -namespace { - -/** - * @brief Like Python range. - * - * Fills the output as out[i] = i. 
- * - * \param [out] out device array, size [end-start] - * \param [in] start of the range - * \param [in] end of range (exclusive) - * \param [in] stream cuda stream - */ template void range(T* out, int start, int end, cudaStream_t stream) { @@ -59,6 +48,7 @@ void range(T* out, int n, cudaStream_t stream) { range(out, 0, n, stream); } -} // unnamed namespace + +} // namespace detail } // namespace linalg } // namespace raft diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp new file mode 100644 index 0000000000..c761c06c14 --- /dev/null +++ b/cpp/include/raft/linalg/detail/lanczos.hpp @@ -0,0 +1,1401 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +// for cmath: +#define _USE_MATH_DEFINES + +#include +#include + +#include +#include + +#include "cublas_wrappers.hpp" +#include +#include +#include +#include +#include + +namespace raft { + +using namespace matrix; +using namespace linalg::detail; + +namespace spectral { +namespace detail { + +// curandGeneratorNormalX +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev) +{ + return curandGenerateNormal(generator, outputPtr, n, mean, stddev); +} +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev) +{ + return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); +} + +// ========================================================= +// Helper functions +// ========================================================= + +/** + * @brief Perform Lanczos iteration + * Lanczos iteration is performed on a shifted matrix A+shift*I. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param iter Pointer to current Lanczos iteration. On exit, the + * variable is set equal to the final Lanczos iteration. + * @param maxIter Maximum Lanczos iteration. This function will + * perform a maximum of maxIter-*iter iterations. + * @param shift Matrix shift. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm (i.e. entry in beta_host) is + * less than tol. + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param alpha_host (Output, host memory, maxIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, maxIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Input/output, device memory, + * n*(maxIter+1) entries) Lanczos vectors. Vectors are stored as + * columns of a column-major matrix with dimensions + * n x (maxIter+1). + * @param work_dev (Output, device memory, maxIter entries) + * Workspace. Not needed if full reorthogonalization is disabled. + * @return Zero if successful. 
Otherwise non-zero. + */ +template +int performLanczosIteration(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t* iter, + index_type_t maxIter, + value_type_t shift, + value_type_t tol, + bool reorthogonalize, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful variables + constexpr value_type_t one = 1; + constexpr value_type_t negOne = -1; + constexpr value_type_t zero = 0; + value_type_t alpha; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); + + index_type_t n = A->nrows_; + + // ------------------------------------------------------- + // Compute second Lanczos vector + // ------------------------------------------------------- + if (*iter <= 0) { + *iter = 1; + + // Apply matrix + if (shift != 0) + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, + lanczosVecs_dev, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); + + // Orthogonalize Lanczos vector + RAFT_CUBLAS_TRY(cublasdot( + cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); + + alpha = -alpha_host[0]; + RAFT_CUBLAS_TRY(cublasaxpy( + cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); + + // Check if Lanczos has converged + if (beta_host[0] <= tol) return 0; + + // Normalize Lanczos vector + alpha = 1 / beta_host[0]; + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + } + + // ------------------------------------------------------- + // Compute remaining Lanczos vectors + // ------------------------------------------------------- + + while (*iter < maxIter) { + ++(*iter); + + // Apply matrix + if (shift != 0) + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, + lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); + + // Full reorthogonalization + // "Twice is enough" algorithm per Kahan and Parlett + if (reorthogonalize) { + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), + work_dev + (*iter - 1), + sizeof(value_type_t), + cudaMemcpyDeviceToHost, + stream)); + + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + } + + // Orthogonalization with 3-term recurrence relation + else { + RAFT_CUBLAS_TRY(cublasdot(cublas_h, + n, + 
lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + alpha_host + (*iter - 1), + stream)); + + auto alpha = -alpha_host[*iter - 1]; + RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + + alpha = -beta_host[*iter - 2]; + RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + } + + // Compute residual + RAFT_CUBLAS_TRY(cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); + + // Check if Lanczos has converged + if (beta_host[*iter - 1] <= tol) break; + + // Normalize Lanczos vector + alpha = 1 / beta_host[*iter - 1]; + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + } + + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + + return 0; +} + +/** + * @brief Find Householder transform for 3-dimensional system + * Given an input vector v=[x,y,z]', this function finds a + * Householder transform P such that P*v is a multiple of + * e_1=[1,0,0]'. The input vector v is overwritten with the + * Householder vector such that P=I-2*v*v'. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param v (Input/output, host memory, 3 entries) Input + * 3-dimensional vector. On exit, the vector is set to the + * Householder vector. + * @param Pv (Output, host memory, 1 entry) First entry of P*v + * (here v is the input vector). Either equal to ||v||_2 or + * -||v||_2. + * @param P (Output, host memory, 9 entries) Householder transform + * matrix. Matrix dimensions are 3 x 3. + */ +template +static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P) +{ + // Compute norm of vector + *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + + // Choose whether to reflect to e_1 or -e_1 + // This choice avoids catastrophic cancellation + if (v[0] >= 0) *Pv = -(*Pv); + v[0] -= *Pv; + + // Normalize Householder vector + value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + if (normHouseholder != 0) { + v[0] /= normHouseholder; + v[1] /= normHouseholder; + v[2] /= normHouseholder; + } else { + v[0] = 0; + v[1] = 0; + v[2] = 0; + } + + // Construct Householder matrix + index_type_t i, j; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) + P[IDX(i, j, 3)] = -2 * v[i] * v[j]; + for (i = 0; i < 3; ++i) + P[IDX(i, i, 3)] += 1; +} + +/** + * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix + * The Householder transform is pre-applied to the top three rows + * of the matrix and post-applied to the left three columns. The + * 4 x 4 matrix is intended to contain the bulge that is produced + * in the Francis QR algorithm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param v (Input, host memory, 3 entries) Householder vector. + * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. 
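findHouseholder3 above picks the reflection sign so that `v[0] -= *Pv` never suffers catastrophic cancellation. The property it relies on, that P = I - 2*v*v' maps the input onto ±||v||·e_1, can be checked on a concrete vector with a few lines of host code; this is a standalone re-derivation of the same steps, not a call into the diff's code:

```cpp
// Host-only sanity check of the Householder identity: after the transform,
// P applied to the original vector should equal (Pv, 0, 0).
#include <cmath>
#include <cstdio>

int main()
{
  double v[3]    = {3.0, 4.0, 12.0};  // arbitrary input vector, ||v|| = 13
  double orig[3] = {v[0], v[1], v[2]};

  double Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
  if (v[0] >= 0) Pv = -Pv;  // reflect toward -e1 to avoid cancellation
  v[0] -= Pv;
  double nrm = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
  for (int i = 0; i < 3; ++i) v[i] /= nrm;  // normalized Householder vector

  // Apply P = I - 2*v*v' to the original vector.
  double dot = v[0] * orig[0] + v[1] * orig[1] + v[2] * orig[2];
  for (int i = 0; i < 3; ++i) orig[i] -= 2.0 * v[i] * dot;

  // Prints P*v = (-13, 0, 0), matching Pv = -13.
  printf("P*v = (%g, %g, %g), expected (%g, 0, 0)\n", orig[0], orig[1], orig[2], Pv);
  return 0;
}
```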
+ */ +template +static void applyHouseholder3(const value_type_t* v, value_type_t* A) +{ + // Loop indices + index_type_t i, j; + // Dot product between Householder vector and matrix row/column + value_type_t vDotA; + + // Pre-apply Householder transform + for (j = 0; j < 4; ++j) { + vDotA = 0; + for (i = 0; i < 3; ++i) + vDotA += v[i] * A[IDX(i, j, 4)]; + for (i = 0; i < 3; ++i) + A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; + } + + // Post-apply Householder transform + for (i = 0; i < 4; ++i) { + vDotA = 0; + for (j = 0; j < 3; ++j) + vDotA += A[IDX(i, j, 4)] * v[j]; + for (j = 0; j < 3; ++j) + A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; + } +} + +/** + * @brief Perform one step of Francis QR algorithm + * Equivalent to two steps of the classical QR algorithm on a + * tridiagonal matrix. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Matrix dimension. + * @param shift1 QR algorithm shift. + * @param shift2 QR algorithm shift. + * @param alpha (Input/output, host memory, n entries) Diagonal + * entries of tridiagonal matrix. + * @param beta (Input/output, host memory, n-1 entries) + * Off-diagonal entries of tridiagonal matrix. + * @param V (Input/output, host memory, n*n entries) Orthonormal + * transforms from previous steps of QR algorithm. Matrix + * dimensions are n x n. On exit, the orthonormal transform from + * this Francis QR step is post-applied to the matrix. + * @param work (Output, host memory, 3*n entries) Workspace. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int francisQRIteration(index_type_t n, + value_type_t shift1, + value_type_t shift2, + value_type_t* alpha, + value_type_t* beta, + value_type_t* V, + value_type_t* work) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Temporary storage of 4x4 bulge and Householder vector + value_type_t bulge[16]; + + // Householder vector + value_type_t householder[3]; + // Householder matrix + value_type_t householderMatrix[3 * 3]; + + // Shifts are roots of the polynomial p(x)=x^2+b*x+c + value_type_t b = -shift1 - shift2; + value_type_t c = shift1 * shift2; + + // Loop indices + index_type_t i, j, pos; + // Temporary variable + value_type_t temp; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute initial Householder transform + householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; + householder[1] = beta[0] * (alpha[0] + alpha[1] + b); + householder[2] = beta[0] * beta[1]; + findHouseholder3(householder, &temp, householderMatrix); + + // Apply initial Householder transform to create bulge + memset(bulge, 0, 16 * sizeof(value_type_t)); + for (i = 0; i < 4; ++i) + bulge[IDX(i, i, 4)] = alpha[i]; + for (i = 0; i < 3; ++i) { + bulge[IDX(i + 1, i, 4)] = beta[i]; + bulge[IDX(i, i + 1, 4)] = beta[i]; + } + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); + memcpy(V, work, 3 * n * sizeof(value_type_t)); + + // Chase bulge to bottom-right of matrix with Householder transforms + for (pos = 0; pos < n - 4; ++pos) { + // Move to next position + alpha[pos] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = bulge[IDX(3, 0, 4)]; + for (j = 0; j < 3; ++j) + 
for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + bulge[IDX(3, 0, 4)] = 0; + bulge[IDX(3, 1, 4)] = 0; + bulge[IDX(3, 2, 4)] = beta[pos + 3]; + bulge[IDX(0, 3, 4)] = 0; + bulge[IDX(1, 3, 4)] = 0; + bulge[IDX(2, 3, 4)] = beta[pos + 3]; + bulge[IDX(3, 3, 4)] = alpha[pos + 4]; + + // Apply Householder transform + findHouseholder3(householder, beta + pos, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); + } + + // Apply penultimate Householder transform + // Values in the last row and column are zero + alpha[n - 4] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = bulge[IDX(3, 0, 4)]; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + bulge[IDX(3, 0, 4)] = 0; + bulge[IDX(3, 1, 4)] = 0; + bulge[IDX(3, 2, 4)] = 0; + bulge[IDX(0, 3, 4)] = 0; + bulge[IDX(1, 3, 4)] = 0; + bulge[IDX(2, 3, 4)] = 0; + bulge[IDX(3, 3, 4)] = 0; + findHouseholder3(householder, beta + n - 4, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); + + // Apply final Householder transform + // Values in the last two rows and columns are zero + alpha[n - 3] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = 0; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + findHouseholder3(householder, beta + n - 3, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); + + // Bulge has been eliminated + alpha[n - 2] = bulge[IDX(0, 0, 4)]; + alpha[n - 1] = bulge[IDX(1, 1, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; + + return 0; +} + +/** + * @brief Perform implicit restart of Lanczos algorithm + * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param n Matrix dimension. + * @param iter Current Lanczos iteration. + * @param iter_new Lanczos iteration after restart. + * @param shiftUpper Pointer (host memory) to upper bound for unwanted + * region. Value is ignored if less than *shiftLower. If a + * stronger upper bound has been found, the value is updated on + * exit. + * @param shiftLower Pointer (host memory) to lower bound for unwanted + * region. Value is ignored if greater than *shiftUpper. If a + * stronger lower bound has been found, the value is updated on + * exit. + * @param alpha_host (Input/output, host memory, iter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Input/output, host memory, iter entries) + * Off-diagonal entries of Lanczos system. + * @param V_host (Output, host memory, iter*iter entries) + * Orthonormal transform used to obtain restarted system. Matrix + * dimensions are iter x iter. + * @param work_host (Output, host memory, 4*iter entries) + * Workspace. 
+ * @param lanczosVecs_dev (Input/output, device memory, n*(iter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (iter+1). + * @param work_dev (Output, device memory, (n+iter)*iter entries) + * Workspace. + * @param smallest_eig specifies whether smallest (true) or largest + * (false) eigenvalues are to be calculated. + * @return error flag. + */ +template +static int lanczosRestart(handle_t const& handle, + index_type_t n, + index_type_t iter, + index_type_t iter_new, + value_type_t* shiftUpper, + value_type_t* shiftLower, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ V_host, + value_type_t* __restrict__ work_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + bool smallest_eig) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful constants + constexpr value_type_t zero = 0; + constexpr value_type_t one = 1; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Loop index + index_type_t i; + + // Number of implicit restart steps + // Assumed to be even since each call to Francis algorithm is + // equivalent to two calls of QR algorithm + index_type_t restartSteps = iter - iter_new; + + // Ritz values from Lanczos method + value_type_t* ritzVals_host = work_host + 3 * iter; + // Shifts for implicit restart + value_type_t* shifts_host; + + // Orthonormal matrix for similarity transform + value_type_t* V_dev = work_dev + n * iter; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute Ritz values + memcpy(ritzVals_host, alpha_host, iter * sizeof(value_type_t)); + memcpy(work_host, beta_host, (iter - 1) * sizeof(value_type_t)); + Lapack::sterf(iter, ritzVals_host, work_host); + + // Debug: Print largest eigenvalues + // for (int i = iter-iter_new; i < iter; ++i) + // std::cout <<*(ritzVals_host+i)<< " "; + // std::cout <<std::endl; + + // Initialize similarity transform with identity matrix + memset(V_host, 0, iter * iter * sizeof(value_type_t)); + for (i = 0; i < iter; ++i) + V_host[IDX(i, i, iter)] = 1; + + // Determine interval to suppress eigenvalues + if (smallest_eig) { + if (*shiftLower > *shiftUpper) { + *shiftUpper = ritzVals_host[iter - 1]; + *shiftLower = ritzVals_host[iter_new]; + } else { + *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - 1]); + *shiftLower = std::min(*shiftLower, ritzVals_host[iter_new]); + } + } else { + if (*shiftLower > *shiftUpper) { + *shiftUpper = ritzVals_host[iter - iter_new - 1]; + *shiftLower = ritzVals_host[0]; + } else { + *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - iter_new - 1]); + *shiftLower = std::min(*shiftLower, ritzVals_host[0]); + } + } + + // Calculate Chebyshev nodes as shifts + shifts_host = ritzVals_host; + for (i = 0; i < restartSteps; ++i) { + shifts_host[i] = cos((i + 0.5) * static_cast(M_PI) / restartSteps); + shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); + shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); + } + + // Apply Francis QR algorithm to implicitly restart Lanczos + for (i = 0; i < restartSteps; i += 2) + if (francisQRIteration( + iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host)) + WARNING("error in implicitly shifted QR algorithm"); + + // Obtain new residual + RAFT_CUDA_TRY(cudaMemcpyAsync( + V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); + + beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + RAFT_CUBLAS_TRY(cublasgemv(cublas_h, +
CUBLAS_OP_N, + n, + iter, + beta_host + iter_new - 1, + lanczosVecs_dev, + n, + V_dev + IDX(0, iter_new, iter), + 1, + beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), + 1, + stream)); + + // Obtain new Lanczos vectors + RAFT_CUBLAS_TRY(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + iter_new, + iter, + &one, + lanczosVecs_dev, + n, + V_dev, + iter, + &zero, + work_dev, + n, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, + work_dev, + n * iter_new * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + + // Normalize residual to obtain new Lanczos vector + RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), + lanczosVecs_dev + IDX(0, iter, n), + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + + RAFT_CUBLAS_TRY(cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); + + auto h_beta = 1 / beta_host[iter_new - 1]; + RAFT_CUBLAS_TRY( + cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); + + return 0; +} + +} // namespace detail +} // namespace spectral + +namespace detail { + +/** + * @brief Compute smallest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is the negative of the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the smallest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th smallest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param shift On exit, pointer to matrix shift (estimate for + * largest eigenvalue). + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Smallest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to smallest eigenvalues of + * matrix.
Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. + */ +template +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* shift, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ + using namespace raft::spectral::detail; + + // Useful constants + constexpr value_type_t one = 1; + constexpr value_type_t zero = 0; + + // Matrix dimension + index_type_t n = A->nrows_; + + // Shift for implicit restart + value_type_t shiftUpper; + value_type_t shiftLower; + + // Lanczos iteration counters + index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system + + // Status flags + int status; + + // Loop index + index_type_t i; + + // Host memory + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // ------------------------------------------------------- + // Variable initialization + // ------------------------------------------------------- + + // Total number of Lanczos iterations + *totalIter = 0; + + // Allocate host memory + std::vector Z_host_v(restartIter * restartIter); + std::vector work_host_v(4 * restartIter); + + Z_host = Z_host_v.data(); + work_host = work_host_v.data(); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // Compute largest eigenvalue to determine shift + // ------------------------------------------------------- + + // Random number generator + curandGenerator_t randGen; + // Initialize random number generator + curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); + + curandSetPseudoRandomGeneratorSeed(randGen, seed); + + // Initialize initial Lanczos vector + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); + value_type_t normQ1; + RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + + auto h_val = 1 / normQ1; + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + *shift = 0; + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0.0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + + // Determine largest eigenvalue + + Lapack::sterf(*effIter, alpha_host, beta_host); + *shift = -alpha_host[*effIter - 1]; + + // 
------------------------------------------------------- + // Compute eigenvectors of shifted matrix + // ------------------------------------------------------- + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter; + + // Apply Lanczos method until convergence + shiftLower = 1; + shiftUpper = -1; + while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { + // Determine number of restart steps + // Number of steps must be even due to Francis algorithm + index_type_t iter_new = nEigVecs + 1; + if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) + iter_new = restartIter - (maxIter - *totalIter); + if ((restartIter - iter_new) % 2) iter_new -= 1; + if (iter_new == *effIter) break; + + // Implicit restart of Lanczos method + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + true); + if (status) WARNING("error in Lanczos implicit restart"); + *effIter = iter_new; + + // Check for convergence + if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; + + // Proceed with Lanczos method + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter - iter_new; + } + + // Warning if Lanczos has failed to converge + if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) { + WARNING("implicitly restarted Lanczos failed to converge"); + } + + // Solve tridiagonal system + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, + work_host); + + // Obtain desired eigenvalues by applying shift + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; + for (i = *effIter; i < nEigVecs; ++i) + work_host[i + 2 * (*effIter)] = 0; + + // Copy results to device memory + RAFT_CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter), + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(work_dev, + Z_host, + (*effIter) * nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); + CHECK_CUDA(stream); + + // Convert eigenvectors from Lanczos basis to standard basis + RAFT_CUBLAS_TRY(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); + + // Clean up and exit + curandDestroyGenerator(randGen); + return 0; +} + +template +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 1234567) +{ + using namespace raft::spectral::detail; + + // Matrix dimension + index_type_t n = A.nrows_; + + // Check 
that parameters are valid + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + // Allocate memory + std::vector alpha_host_v(restartIter); + std::vector beta_host_v(restartIter); + + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); + + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); + + // Perform Lanczos method + index_type_t effIter; + value_type_t shift; + int status = computeSmallestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + &shift, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); + + // Clean up and return + return status; +} + +/** + * @brief Compute largest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are most + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the largest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th largest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to largest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag.
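Both driver routines lean on the spectral-shift idea spelled out in these comments: estimate the extremal eigenvalue first, then run the implicitly restarted iteration on A + s*I so the wanted end of the spectrum becomes extremal, and undo the shift when reporting results. A toy illustration of that re-ordering, with made-up numbers and no Lanczos involved:

```cpp
// Toy illustration of the shift used above: with s = -lambda_max, the
// smallest eigenvalues of A become the most negative (extremal) ones of
// A + s*I, which restarted Lanczos resolves fastest.
#include <cstdio>

int main()
{
  double eigs[3] = {1.0, 2.0, 10.0};  // spectrum of a toy SPD matrix
  double s       = -eigs[2];          // s = -lambda_max, as in the code above
  for (int i = 0; i < 3; ++i)
    printf("lambda = %4g  ->  lambda + s = %4g\n", eigs[i], eigs[i] + s);
  // Prints 1 -> -9, 2 -> -8, 10 -> 0: the wanted small eigenvalues are now
  // extremal. The drivers recover the true values afterwards by subtracting
  // s from the computed Ritz values.
  return 0;
}
```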
+ */ +template +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ + using namespace raft::spectral::detail; + + // Useful constants + constexpr value_type_t one = 1; + constexpr value_type_t zero = 0; + + // Matrix dimension + index_type_t n = A->nrows_; + + // Lanczos iteration counters + index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system + + // Status flags + int status; + + // Loop index + index_type_t i; + + // Host memory + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace + + // ------------------------------------------------------- + // Check that LAPACK is enabled + // ------------------------------------------------------- + // Lapack::check_lapack_enabled(); + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // ------------------------------------------------------- + // Variable initialization + // ------------------------------------------------------- + + // Total number of Lanczos iterations + *totalIter = 0; + + // Allocate host memory + std::vector Z_host_v(restartIter * restartIter); + std::vector work_host_v(4 * restartIter); + + Z_host = Z_host_v.data(); + work_host = work_host_v.data(); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // Compute largest eigenvalue + // ------------------------------------------------------- + + // Random number generator + curandGenerator_t randGen; + // Initialize random number generator + curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); + curandSetPseudoRandomGeneratorSeed(randGen, seed); + // Initialize initial Lanczos vector + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); + value_type_t normQ1; + RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + + auto h_val = 1 / normQ1; + RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + value_type_t shift_val = 0.0; + value_type_t* shift = &shift_val; + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter; + + // Apply Lanczos method until convergence + value_type_t shiftLower = 1; + value_type_t shiftUpper = -1; + while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * 
shiftLower) { + // Determine number of restart steps + // Number of steps must be even due to Francis algorithm + index_type_t iter_new = nEigVecs + 1; + if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) + iter_new = restartIter - (maxIter - *totalIter); + if ((restartIter - iter_new) % 2) iter_new -= 1; + if (iter_new == *effIter) break; + + // Implicit restart of Lanczos method + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + false); + if (status) WARNING("error in Lanczos implicit restart"); + *effIter = iter_new; + + // Check for convergence + if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; + + // Proceed with Lanczos method + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter - iter_new; + } + + // Warning if Lanczos has failed to converge + if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) { + WARNING("implicitly restarted Lanczos failed to converge"); + } + for (int i = 0; i < restartIter; ++i) { + for (int j = 0; j < restartIter; ++j) + Z_host[i * restartIter + j] = 0; + } + // Solve tridiagonal system + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, + work_host); + + // note: We need to pick the top nEigVecs eigenvalues, + // but effIter can be larger than nEigVecs, + // hence we add an offset for that case, because we want to access the top nEigVecs eigenpairs in the + // matrix of size effIter.
Remember that the array is sorted, so the offset is not needed in the smallest- + // eigenvalues case, where the first entries are already the smallest ones. + + index_type_t top_eigenparis_idx_offset = *effIter - nEigVecs; + + // Debug : print nEigVecs largest eigenvalues + // for (int i = top_eigenparis_idx_offset; i < *effIter; ++i) + // std::cout <<*(work_host+(2*(*effIter)+i))<< " "; + // std::cout <<std::endl; + + // Obtain desired eigenvalues by applying shift + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; + for (i = 0; i < top_eigenparis_idx_offset; ++i) + work_host[i + 2 * (*effIter)] = 0; + + // Copy results to device memory, skipping the smallest eigenvalues if needed + RAFT_CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter) + top_eigenparis_idx_offset, + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); + + // skip the smallest eigenvectors if needed + RAFT_CUDA_TRY(cudaMemcpyAsync(work_dev, + Z_host + (top_eigenparis_idx_offset * (*effIter)), + (*effIter) * nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); + CHECK_CUDA(stream); + + // Convert eigenvectors from Lanczos basis to standard basis + RAFT_CUBLAS_TRY(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); + + // Clean up and exit + curandDestroyGenerator(randGen); + return 0; +} + +template +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 123456) +{ + // Matrix dimension + index_type_t n = A.nrows_; + + // Check that parameters are valid + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + // Allocate memory + std::vector alpha_host_v(restartIter); + std::vector beta_host_v(restartIter); + + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); + + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); + + // Perform Lanczos method + index_type_t effIter; + int status = computeLargestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); + + // Clean up and return + return status; +} + +} // namespace detail +} // namespace raft diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh new file mode 100644 index 0000000000..56f1dd6f19 --- /dev/null +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args) +{ + auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); + + if (idx < len) { out[idx] = map(in[idx], args[idx]...); } +} + +template +void mapImpl( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args...
args) +{ + const int nblks = raft::ceildiv(len, (size_t)TPB); + mapKernel + <<>>(out, len, map, in, args...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // namespace detail +} // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh similarity index 54% rename from cpp/include/raft/linalg/map_then_reduce.cuh rename to cpp/include/raft/linalg/detail/map_then_reduce.cuh index 2fa3ae72de..281861b2f9 100644 --- a/cpp/include/raft/linalg/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ namespace raft { namespace linalg { +namespace detail { struct sum_tag { }; @@ -91,67 +92,6 @@ void mapThenReduceImpl(OutType* out, RAFT_CUDA_TRY(cudaPeekAtLastError()); } -/** - * @brief CUDA version of map and then sum reduction operation - * @tparam Type data-type upon which the math operation will be performed - * @tparam MapOp the device-lambda performing the actual operation - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Args additional parameters - * @param out the output sum-reduced value (assumed to be a device pointer) - * @param len number of elements in the input array - * @param map the device-lambda - * @param stream cuda-stream where to launch this kernel - * @param in the input array - * @param args additional input arrays - */ - -template -void mapThenSumReduce( - OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) -{ - mapThenReduceImpl( - out, len, (OutType)0, map, sum_tag(), stream, in, args...); -} - -/** - * @brief CUDA version of map and then generic reduction operation - * @tparam Type data-type upon which the math operation will be performed - * @tparam MapOp the device-lambda performing the actual map operation - * @tparam ReduceLambda the device-lambda performing the actual reduction - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Args additional parameters - * @param out the output reduced value (assumed to be a device pointer) - * @param len number of elements in the input array - * @param neutral The neutral element of the reduction operation. For example: - * 0 for sum, 1 for multiply, +Inf for Min, -Inf for Max - * @param map the device-lambda - * @param op the reduction device lambda - * @param stream cuda-stream where to launch this kernel - * @param in the input array - * @param args additional input arrays - */ - -template -void mapThenReduce(OutType* out, - size_t len, - OutType neutral, - MapOp map, - ReduceLambda op, - cudaStream_t stream, - const InType* in, - Args... args) -{ - mapThenReduceImpl( - out, len, neutral, map, op, stream, in, args...); -} +}; // end namespace detail }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh new file mode 100644 index 0000000000..94545e59f6 --- /dev/null +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh new file mode 100644 index 0000000000..94545e59f6 --- /dev/null +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +namespace { +template <size_t VecBytes> +struct AlignedAccess { + template <typename T> + static inline bool test(const T* matrix, size_t strideBytes) + { + return Pow2<VecBytes>::isAligned(matrix) && Pow2<VecBytes>::isAligned(strideBytes) && + Pow2<sizeof(T)>::isAligned(VecBytes); + } +}; +}; // namespace + +template <typename Type, int veclen_, typename Lambda, typename IdxType> +__global__ void matrixVectorOpKernel(Type* out, + const Type* matrix, + const Type* vector, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op) +{ + typedef TxN_t<Type, veclen_> VecType; + IdxType len = N * D; + IdxType idx = threadIdx.x; + idx += (IdxType)blockIdx.x * (IdxType)blockDim.x; + idx *= VecType::Ratio; + if (idx >= len) return; + IdxType vIdx; + VecType mat, vec; + ///@todo: yikes! use fast-int-div here. + ///@todo: shared mem for vector could help with perf + if (rowMajor && bcastAlongRows) { + vIdx = idx % D; + vec.load(vector, vIdx); + } else if (!rowMajor && !bcastAlongRows) { + vIdx = idx % N; + vec.load(vector, vIdx); + } else if (rowMajor && !bcastAlongRows) { + vIdx = idx / D; + vec.fill(vector[vIdx]); + } else { + vIdx = idx / N; + vec.fill(vector[vIdx]); + } + mat.load(matrix, idx); +#pragma unroll + for (int i = 0; i < VecType::Ratio; ++i) + mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]); + mat.store(out, idx); +} + +template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB> +void matrixVectorOpImpl(Type* out, + const Type* matrix, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + IdxType len = N * D; + IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); + matrixVectorOpKernel<Type, veclen_, Lambda, IdxType> + <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256> +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + IdxType stride = rowMajor ? D : N; + IdxType nLines = rowMajor ? N : D; + return matrix::linewiseOp( + out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec); +} + +template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256> +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + IdxType stride = rowMajor ? D : N; + IdxType nLines = rowMajor ? N : D; + return matrix::linewiseOp( + out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec1, vec2); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft
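A minimal usage sketch of the detail-level matrixVectorOp defined above (illustrative; out, matrix, and vec are assumed device float pointers, with matrix holding N rows of D columns in row-major order):

  // Divide every row of the matrix by the length-D vector.
  raft::linalg::detail::matrixVectorOp(
    out, matrix, vec, D, N,
    true /* rowMajor */, true /* bcastAlongRows */,
    [] __device__(float m, float v) { return m / v; }, stream);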
diff --git a/cpp/include/raft/linalg/detail/mean_squared_error.hpp b/cpp/include/raft/linalg/detail/mean_squared_error.hpp new file mode 100644 index 0000000000..f0a9daebdb --- /dev/null +++ b/cpp/include/raft/linalg/detail/mean_squared_error.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +template <typename math_t> +void meanSquaredError( + math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) +{ + auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) { + math_t diff = a - b; + return diff * diff * weight / len; + }; + raft::linalg::mapThenSumReduce(out, len, sq_diff, stream, A, B); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/detail/multiply.hpp b/cpp/include/raft/linalg/detail/multiply.hpp new file mode 100644 index 0000000000..da06c23aed --- /dev/null +++ b/cpp/include/raft/linalg/detail/multiply.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +template <typename math_t, typename IdxType> +void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp( + out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft
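Note that meanSquaredError folds the weight into every term as weight / len, so a weight of 1 yields the plain mean squared error. A minimal usage sketch for the two helpers above (illustrative; all pointers are assumed device float buffers, mse_out a single device float):

  // Mean squared error between two length-len buffers A and B.
  raft::linalg::detail::meanSquaredError(mse_out, A, B, len, 1.0f, stream);
  // Element-wise scaling: out[i] = 2 * in[i].
  raft::linalg::detail::multiplyScalar(out, in, 2.0f, len, stream);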
diff --git a/cpp/include/raft/linalg/detail/norm.hpp b/cpp/include/raft/linalg/detail/norm.hpp new file mode 100644 index 0000000000..fcf98c7daf --- /dev/null +++ b/cpp/include/raft/linalg/detail/norm.hpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +/** different types of norms supported on the input buffers */ +enum NormType { L1Norm = 0, L2Norm }; + +template <typename Type, typename IdxType, typename Lambda> +void rowNormCaller(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op) +{ + switch (type) { + case L1Norm: + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L1Op<Type, IdxType>(), + raft::Sum<Type>(), + fin_op); + break; + case L2Norm: + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L2Op<Type, IdxType>(), + raft::Sum<Type>(), + fin_op); + break; + default: ASSERT(false, "Invalid norm type passed! [%d]", type); + }; +} + +template <typename Type, typename IdxType, typename Lambda> +void colNormCaller(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op) +{ + switch (type) { + case L1Norm: + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L1Op<Type, IdxType>(), + raft::Sum<Type>(), + fin_op); + break; + case L2Norm: + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L2Op<Type, IdxType>(), + raft::Sum<Type>(), + fin_op); + break; + default: ASSERT(false, "Invalid norm type passed! [%d]", type); + }; +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft
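A minimal usage sketch of the norm callers above (illustrative; dots and data are assumed device float pointers, data holding N rows of D columns in row-major order):

  // Row-wise L2 norms: accumulate squares, then take the square root per row.
  raft::linalg::detail::rowNormCaller(
    dots, data, D, N, raft::linalg::detail::L2Norm, true /* rowMajor */, stream,
    [] __device__(float sq) { return sqrtf(sq); });

diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh similarity index 77% rename from cpp/include/raft/linalg/qr.cuh rename to cpp/include/raft/linalg/detail/qr.cuh index 2870d6d072..a250dd3578 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/detail/qr.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,30 +16,16 @@ #pragma once -#include -#include +#include "cublas_wrappers.hpp" +#include "cusolver_wrappers.hpp" #include #include #include namespace raft { namespace linalg { +namespace detail { -/** - * @defgroup QRdecomp QR decomposition - * @{ - */ - -/** - * @brief compute QR decomp and return only Q matrix - * @param handle: raft handle - * @param M: input matrix - * @param Q: Q matrix to be returned (on GPU) - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param stream cuda stream - * @{ - */ template <typename math_t> void qrGetQ(const raft::handle_t& handle, const math_t* M, math_t* Q, @@ -64,26 +50,13 @@ void qrGetQ(const raft::handle_t& handle, rmm::device_uvector<math_t> workspace(Lwork, stream); RAFT_CUSOLVER_TRY(cusolverDngeqrf( cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); - /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail.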
-#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); -#endif + RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); workspace.resize(Lwork, stream); RAFT_CUSOLVER_TRY(cusolverDnorgqr( cusolverH, m, n, k, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); } -/** - * @brief compute QR decomp and return both Q and R matrices - * @param handle: raft handle - * @param M: input matrix - * @param Q: Q matrix to be returned (on GPU) - * @param R: R matrix to be returned (on GPU) - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param stream cuda stream - */ template void qrGetQR(const raft::handle_t& handle, math_t* M, @@ -119,10 +92,6 @@ void qrGetQR(const raft::handle_t& handle, Lwork, devInfo.data(), stream)); - // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. -#if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); -#endif raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream); @@ -145,7 +114,7 @@ void qrGetQR(const raft::handle_t& handle, devInfo.data(), stream)); } -/** @} */ +}; // namespace detail }; // namespace linalg }; // namespace raft diff --git a/cpp/include/raft/linalg/detail/reduce.hpp b/cpp/include/raft/linalg/detail/reduce.hpp new file mode 100644 index 0000000000..94c8f5ba52 --- /dev/null +++ b/cpp/include/raft/linalg/detail/reduce.hpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template <typename InType, + typename OutType = InType, + typename IdxType = int, + typename MainLambda = raft::Nop<InType, IdxType>, + typename ReduceLambda = raft::Sum<OutType>, + typename FinalLambda = raft::Nop<OutType>> +void reduce(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + bool rowMajor, + bool alongRows, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop<InType, IdxType>(), + ReduceLambda reduce_op = raft::Sum<OutType>(), + FinalLambda final_op = raft::Nop<OutType>()) +{ + if (rowMajor && alongRows) { + raft::linalg::coalescedReduction( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (rowMajor && !alongRows) { + raft::linalg::stridedReduction( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (!rowMajor && alongRows) { + raft::linalg::stridedReduction( + dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); + } else { + raft::linalg::coalescedReduction( + dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); + } +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft
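The reduce wrapper above selects the coalesced or strided kernel from the (rowMajor, alongRows) pair. A minimal usage sketch (illustrative; dots and data are assumed device float pointers, data holding N rows of D columns in row-major order, and raft::L1Op as used by the norm callers earlier in this diff):

  // Row-wise L1 norms into dots (N entries).
  raft::linalg::detail::reduce(dots, data, D, N, 0.0f,
                               true /* rowMajor */, true /* alongRows */,
                               stream, false /* inplace */,
                               raft::L1Op<float, int>());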
diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh similarity index 80% rename from cpp/include/raft/linalg/strided_reduction.cuh rename to cpp/include/raft/linalg/detail/strided_reduction.cuh index 0434f87151..a0d1e2abaa 100644 --- a/cpp/include/raft/linalg/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,10 +19,12 @@ #include "unary_op.cuh" #include #include +#include #include namespace raft { namespace linalg { +namespace detail { // Kernel to perform reductions along the strided dimension // of the matrix, i.e. reduce along columns for row major or reduce along rows @@ -102,33 +104,6 @@ __global__ void stridedReductionKernel(OutType* dots, raft::myAtomicReduce(dots + colStart, temp[myidx], reduce_op); } -/** - * @brief Compute reduction of the input matrix along the strided dimension - * - * @tparam InType the data type of the input - * @tparam OutType the data type of the output (as well as the data type for - * which reduction is performed) - * @tparam IdxType data type of the indices of the array - * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) - * It must be a 'callable' supporting the following input and output: - * OutType (*MainLambda)(InType, IdxType); - * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) - * It must be a 'callable' supporting the following input and output: - * OutType (*ReduceLambda)(OutType); - * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) - * It must be a 'callable' supporting the following input and output: - * OutType (*FinalLambda)(OutType);
- * @param dots the output reduction vector - * @param data the input matrix - * @param D leading dimension of data - * @param N second dimension data - * @param init initial value to use for the reduction - * @param main_op elementwise operation to apply before reduction - * @param reduce_op binary reduction operation - * @param final_op elementwise operation to apply before storing results - * @param inplace reduction result added inplace or overwrites old values? - * @param stream cuda stream where to launch work - */ template +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template <typename InT, typename OutT = InT, typename IdxType = int> +void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ + auto op = [scalar] __device__(InT in) { return OutT(in - scalar); }; + raft::linalg::unaryOp(out, in, len, op, stream); +} + +template <typename InT, typename OutT = InT, typename IdxType = int> +void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ + auto op = [] __device__(InT a, InT b) { return OutT(a - b); }; + raft::linalg::binaryOp(out, in1, in2, len, op, stream); +} + +template <typename math_t, typename IdxType> +__global__ void subtract_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) +{ + // TODO: the kernel does not use shared memory in the current implementation + int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; + if (i < len) { outDev[i] = inDev[i] - *singleScalarDev; } +} + +template <typename math_t, typename IdxType = int, int TPB = 256> +void subtractDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ + // Note: there is no efficient way to express this operation with cuBLAS; + // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda + const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); + subtract_dev_scalar_kernel<math_t> + <<<nblks, TPB, 0, stream>>>(outDev, inDev, singleScalarDev, len); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/detail/svd.hpp similarity index 59% rename from cpp/include/raft/linalg/svd.cuh rename to cpp/include/raft/linalg/detail/svd.hpp index 2afae788a1..796adc89ff 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/detail/svd.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,15 +16,16 @@ #pragma once -#include "eig.cuh" -#include "gemm.cuh" -#include "transpose.h" +#include "cublas_wrappers.hpp" +#include "cusolver_wrappers.hpp" +#include +#include +#include + #include #include #include #include -#include -#include #include #include #include @@ -32,25 +33,8 @@ namespace raft { namespace linalg { +namespace detail { -/** - * @brief singular value decomposition (SVD) on the column major float type - * input matrix using QR method - * @param handle: raft handle - * @param in: input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param sing_vals: singular values of input matrix - * @param left_sing_vecs: left singular values of input matrix - * @param right_sing_vecs: right singular values of input matrix - * @param trans_right: transpose right vectors or not - * @param gen_left_vec: generate left eig vector. Not activated.
- * @param gen_right_vec: generate right eig vector. Not activated. - * @param stream cuda stream - */ -// TODO: activate gen_left_vec and gen_right_vec options -// TODO: couldn't template this function due to cusolverDnSgesvd and -// cusolverSnSgesvd. Check if there is any other way. template void svdQR(const raft::handle_t& handle, T* in, @@ -69,15 +53,6 @@ void svdQR(const raft::handle_t& handle, cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); cublasHandle_t cublasH = handle.get_cublas_handle(); -#if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000 - // 46340: sqrt of max int value - ASSERT(n_rows <= 46340, - "svd solver is not supported for the data that has more than 46340 " - "samples (rows) " - "if you are using CUDA version <11. Please use other solvers such as " - "eig if it is available."); -#endif - const int m = n_rows; const int n = n_cols; @@ -167,7 +142,7 @@ void svdEig(const raft::handle_t& handle, beta, stream); - eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream); + raft::linalg::eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream); raft::matrix::colReverse(V, n_cols, n_cols, stream); raft::matrix::rowReverse(S, n_cols, 1, stream); @@ -192,23 +167,6 @@ void svdEig(const raft::handle_t& handle, } } -/** - * @brief on the column major input matrix using Jacobi method - * @param handle: raft handle - * @param in: input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param sing_vals: singular values of input matrix - * @param left_sing_vecs: left singular vectors of input matrix - * @param right_sing_vecs: right singular vectors of input matrix - * @param gen_left_vec: generate left eig vector. Not activated. - * @param gen_right_vec: generate right eig vector. Not activated. - * @param tol: error tolerance for the jacobi method. Algorithm stops when the - * error is below tol - * @param max_sweeps: number of sweeps in the Jacobi algorithm. The more the better - * accuracy. 
- * @param stream cuda stream - */ template void svdJacobi(const raft::handle_t& handle, math_t* in, @@ -241,57 +199,44 @@ void svdJacobi(const raft::handle_t& handle, int lwork = 0; int econ = 1; - RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngesvdj_bufferSize(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - econ, - m, - n, - in, - m, - sing_vals, - left_sing_vecs, - m, - right_sing_vecs, - n, - &lwork, - gesvdj_params)); + RAFT_CUSOLVER_TRY(cusolverDngesvdj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + &lwork, + gesvdj_params)); rmm::device_uvector d_work(lwork, stream); - RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngesvdj(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - econ, - m, - n, - in, - m, - sing_vals, - left_sing_vecs, - m, - right_sing_vecs, - n, - d_work.data(), - lwork, - devInfo.data(), - gesvdj_params, - stream)); + RAFT_CUSOLVER_TRY(cusolverDngesvdj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + d_work.data(), + lwork, + devInfo.data(), + gesvdj_params, + stream)); RAFT_CUSOLVER_TRY(cusolverDnDestroyGesvdjInfo(gesvdj_params)); } -/** - * @brief reconstruct a matrix use left and right singular vectors and - * singular values - * @param handle: raft handle - * @param U: left singular vectors of size n_rows x k - * @param S: square matrix with singular values on its diagonal, k x k - * @param V: right singular vectors of size n_cols x k - * @param out: reconstructed matrix to be returned - * @param n_rows: number rows of output matrix - * @param n_cols: number columns of output matrix - * @param k: number of singular values - * @param stream cuda stream - */ template void svdReconstruction(const raft::handle_t& handle, math_t* U, @@ -323,20 +268,6 @@ void svdReconstruction(const raft::handle_t& handle, stream); } -/** - * @brief reconstruct a matrix use left and right singular vectors and - * singular values - * @param handle: raft handle - * @param A_d: input matrix - * @param U: left singular vectors of size n_rows x k - * @param S_vec: singular values as a vector - * @param V: right singular vectors of size n_cols x k - * @param n_rows: number rows of output matrix - * @param n_cols: number columns of output matrix - * @param k: number of singular values to be computed, 1.0 for normal SVD - * @param tol: tolerance for the evaluation - * @param stream cuda stream - */ template bool evaluateSVDByL2Norm(const raft::handle_t& handle, math_t* A_d, @@ -374,25 +305,26 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle, rmm::device_uvector A_minus_P(m * n, stream); RAFT_CUDA_TRY(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); - RAFT_CUBLAS_TRY(raft::linalg::cublasgeam(cublasH, - CUBLAS_OP_N, - CUBLAS_OP_N, - m, - n, - &alpha, - A_d, - m, - &beta, - P_d.data(), - m, - A_minus_P.data(), - m, - stream)); + RAFT_CUBLAS_TRY(cublasgeam(cublasH, + CUBLAS_OP_N, + CUBLAS_OP_N, + m, + n, + &alpha, + A_d, + m, + &beta, + P_d.data(), + m, + A_minus_P.data(), + m, + stream)); math_t norm_A_minus_P = raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); math_t percent_error = 100.0 * norm_A_minus_P / normA; return (percent_error / 100.0 < tol); } +}; // end namespace detail }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/detail/transpose.hpp similarity index 56% rename from cpp/include/raft/linalg/transpose.h rename to 
cpp/include/raft/linalg/detail/transpose.hpp index 6b629e73f0..659d3a8ef6 100644 --- a/cpp/include/raft/linalg/transpose.h +++ b/cpp/include/raft/linalg/detail/transpose.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,22 +16,15 @@ #pragma once +#include "cublas_wrappers.hpp" + #include -#include #include namespace raft { namespace linalg { +namespace detail { -/** - * @brief transpose on the column major input matrix using Jacobi method - * @param handle: raft handle - * @param in: input matrix - * @param out: output. Transposed input matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param stream: cuda stream - */ template void transpose(const raft::handle_t& handle, math_t* in, @@ -47,28 +40,22 @@ void transpose(const raft::handle_t& handle, const math_t alpha = 1.0; const math_t beta = 0.0; - RAFT_CUBLAS_TRY(raft::linalg::cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - out_n_rows, - out_n_cols, - &alpha, - in, - n_rows, - &beta, - out, - out_n_rows, - out, - out_n_rows, - stream)); + RAFT_CUBLAS_TRY(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_n_rows, + out_n_cols, + &alpha, + in, + n_rows, + &beta, + out, + out_n_rows, + out, + out_n_rows, + stream)); } -/** - * @brief transpose on the column major input matrix using Jacobi method - * @param inout: input and output matrix - * @param n: number of rows and columns of input matrix - * @param stream: cuda stream - */ template void transpose(math_t* inout, int n, cudaStream_t stream) { @@ -90,5 +77,6 @@ void transpose(math_t* inout, int n, cudaStream_t stream) }); } +}; // end namespace detail }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh similarity index 69% rename from cpp/include/raft/linalg/unary_op.cuh rename to cpp/include/raft/linalg/detail/unary_op.cuh index ae8cff2325..9ddfe79657 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/detail/unary_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
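A minimal usage sketch of the detail-level transpose wrappers that the hunk above just moved (illustrative; in and out are assumed device float pointers holding column-major data):

  // Out-of-place transpose of an n_rows x n_cols matrix via cublasgeam.
  raft::linalg::detail::transpose(handle, in, out, n_rows, n_cols, stream);
  // In-place transpose of a square n x n matrix.
  raft::linalg::detail::transpose(inout, n, stream);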
@@ -22,6 +22,7 @@ namespace raft { namespace linalg { +namespace detail { template __global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op) @@ -50,27 +51,12 @@ void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStr RAFT_CUDA_TRY(cudaPeekAtLastError()); } -/** - * @brief perform element-wise unary operation in the input array - * @tparam InType input data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam OutType output data-type - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in the input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work - * @note Lambda must be a functor with the following signature: - * `OutType func(const InType& val);` - */ template -void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +void unaryOpCaller(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) { if (len <= 0) return; // silently skip in case of 0 length input constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); @@ -99,25 +85,8 @@ __global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) if (idx < len) { op(out + idx, idx); } } -/** - * @brief Perform an element-wise unary operation into the output array - * - * Compared to `unaryOp()`, this method does not do any reads from any inputs - * - * @tparam OutType output data-type - * @tparam Lambda the device-lambda performing the actual operation - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * - * @param[out] out the output array [on device] [len = len] - * @param[in] len number of elements in the input array - * @param[in] op the device-lambda which must be of the form: - * `void func(OutType* outLocationOffset, IdxType idx);` - * where outLocationOffset will be out + idx. - * @param[in] stream cuda stream where to launch work - */ template -void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) +void writeOnlyUnaryOpCaller(OutType* out, IdxType len, Lambda op, cudaStream_t stream) { if (len <= 0) return; // silently skip in case of 0 length input auto nblks = raft::ceildiv(len, TPB); @@ -125,5 +94,6 @@ void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) RAFT_CUDA_TRY(cudaGetLastError()); } +}; // end namespace detail }; // end namespace linalg }; // end namespace raft diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.hpp similarity index 88% rename from cpp/include/raft/linalg/divide.cuh rename to cpp/include/raft/linalg/divide.hpp index 562a3d8991..6c8480bf19 100644 --- a/cpp/include/raft/linalg/divide.cuh +++ b/cpp/include/raft/linalg/divide.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
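A minimal usage sketch of the write-only variant renamed above (illustrative; out is an assumed device float pointer and len an int; the lambda receives the output location and its index, so no input array is read):

  // Fill out[i] = i.
  raft::linalg::detail::writeOnlyUnaryOpCaller(
    out, len, [] __device__(float* ptr, int idx) { *ptr = static_cast<float>(idx); }, stream);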
@@ -16,11 +16,13 @@ #pragma once -#include "unary_op.cuh" +#include "detail/divide.hpp" namespace raft { namespace linalg { +using detail::divides_scalar; + /** * @defgroup ScalarOps Scalar operations on the input buffer * @tparam math_t data-type upon which the math operation will be performed @@ -35,8 +37,7 @@ namespace linalg { template void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) { - unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, stream); + detail::divideScalar(out, in, scalar, len, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp new file mode 100644 index 0000000000..5c465a3a41 --- /dev/null +++ b/cpp/include/raft/linalg/eig.hpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/eig.hpp" + +namespace raft { +namespace linalg { + +/** + * @defgroup eig Eigen Decomposition Methods + * @{ + */ + +/** + * @brief eig decomp with divide and conquer method for the column-major + * symmetric matrices + * @param handle raft handle + * @param in the input buffer (symmetric matrix that has real eig values and + * vectors. + * @param n_rows: number of rows of the input + * @param n_cols: number of cols of the input + * @param eig_vectors: eigenvectors + * @param eig_vals: eigen values + * @param stream cuda stream + */ +template +void eigDC(const raft::handle_t& handle, + const math_t* in, + std::size_t n_rows, + std::size_t n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream) +{ + detail::eigDC(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); +} + +using detail::COPY_INPUT; +using detail::EigVecMemUsage; +using detail::OVERWRITE_INPUT; + +/** + * @brief eig sel decomp with divide and conquer method for the column-major + * symmetric matrices + * @param handle raft handle + * @param in the input buffer (symmetric matrix that has real eig values and + * vectors. 
+ * @param n_rows: number of rows of the input + * @param n_cols: number of cols of the input + * @param n_eig_vals: number of eigenvectors to be generated + * @param eig_vectors: eigenvectors + * @param eig_vals: eigen values + * @param memUsage: the memory selection for eig vector output + * @param stream cuda stream + */ +template +void eigSelDC(const raft::handle_t& handle, + math_t* in, + int n_rows, + int n_cols, + int n_eig_vals, + math_t* eig_vectors, + math_t* eig_vals, + EigVecMemUsage memUsage, + cudaStream_t stream) +{ + detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, eig_vals, memUsage, stream); +} + +/** + * @brief overloaded function for eig decomp with Jacobi method for the + * column-major symmetric matrices (in parameter) + * @param handle: raft handle + * @param in: input matrix + * @param n_rows: number of rows of the input + * @param n_cols: number of cols of the input + * @param eig_vectors: eigenvectors + * @param eig_vals: eigen values + * @param stream: stream on which this function will be run + * @param tol: error tolerance for the jacobi method. Algorithm stops when the + * error is below tol + * @param sweeps: number of sweeps in the Jacobi algorithm. The more the better + * accuracy. + */ +template +void eigJacobi(const raft::handle_t& handle, + const math_t* in, + int n_rows, + int n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream, + math_t tol = 1.e-7, + int sweeps = 15) +{ + detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps); +} +/** @} */ // end of eig + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.hpp similarity index 75% rename from cpp/include/raft/linalg/eltwise.cuh rename to cpp/include/raft/linalg/eltwise.hpp index 097c3ac218..5c2a97b57d 100644 --- a/cpp/include/raft/linalg/eltwise.cuh +++ b/cpp/include/raft/linalg/eltwise.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
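A minimal usage sketch of the eigDC wrapper introduced above (illustrative; in, eig_vectors, and eig_vals are assumed device float pointers, with in a column-major symmetric n x n matrix; the underlying cusolver syevd path returns eigenvalues in ascending order):

  raft::linalg::eigDC(handle, in, n, n, eig_vectors, eig_vals, stream);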
@@ -16,12 +16,13 @@ #pragma once -#include "binary_op.cuh" -#include "unary_op.cuh" +#include "detail/eltwise.hpp" namespace raft { namespace linalg { +using detail::adds_scalar; + /** * @defgroup ScalarOps Scalar operations on the input buffer * @tparam InType data-type upon which the math operation will be performed @@ -36,15 +37,15 @@ namespace linalg { template void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) { - raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in + scalar; }, stream); + detail::scalarAdd(out, in, scalar, len, stream); } +using detail::multiplies_scalar; + template void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) { - raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in * scalar; }, stream); + detail::scalarMultiply(out, in, scalar, len, stream); } /** @} */ @@ -63,50 +64,37 @@ template void eltwiseAdd( OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream); + detail::eltwiseAdd(out, in1, in2, len, stream); } template void eltwiseSub( OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, stream); + detail::eltwiseSub(out, in1, in2, len, stream); } template void eltwiseMultiply( OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, stream); + detail::eltwiseMultiply(out, in1, in2, len, stream); } template void eltwiseDivide( OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, stream); + detail::eltwiseDivide(out, in1, in2, len, stream); } +using detail::divides_check_zero; + template void eltwiseDivideCheckZero( OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) { - binaryOp( - out, - in1, - in2, - len, - [] __device__(InType a, InType b) { - if (b == InType(0.0)) - return InType(0.0); - else - return a / b; - }, - stream); + detail::eltwiseDivideCheckZero(out, in1, in2, len, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp new file mode 100644 index 0000000000..04ddbb3561 --- /dev/null +++ b/cpp/include/raft/linalg/gemm.hpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/gemm.hpp" + +namespace raft { +namespace linalg { + +/** + * @brief the wrapper of cublas gemm function + * It computes the following equation: D = alpha . opA(A) * opB(B) + beta . 
C + * @tparam math_t the type of input/output matrices + * @param handle raft handle + * @param a input matrix + * @param n_rows_a number of rows of A + * @param n_cols_a number of columns of A + * @param b input matrix + * @param c output matrix + * @param n_rows_c number of rows of C + * @param n_cols_c number of columns of C + * @param trans_a cublas transpose op for A + * @param trans_b cublas transpose op for B + * @param alpha scalar + * @param beta scalar + * @param stream cuda stream + */ +template +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + math_t alpha, + math_t beta, + cudaStream_t stream) +{ + detail::gemm( + handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); +} + +/** + * @brief the wrapper of cublas gemm function + * It computes the following equation: D = alpha . opA(A) * opB(B) + beta . C + * @tparam math_t the type of input/output matrices + * @param handle raft handle + * @param a input matrix + * @param n_rows_a number of rows of A + * @param n_cols_a number of columns of A + * @param b input matrix + * @param c output matrix + * @param n_rows_c number of rows of C + * @param n_cols_c number of columns of C + * @param trans_a cublas transpose op for A + * @param trans_b cublas transpose op for B + * @param stream cuda stream + */ +template +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + cudaStream_t stream) +{ + detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, stream); +} + +/** + * @brief A wrapper for CUBLS GEMM function designed for handling all possible + * combinations of operand layouts. + * It computes the following equation: Z = alpha . X * Y + beta . Z + * @tparam T Data type of input/output matrices (float/double) + * @param handle raft handle + * @param z output matrix of size M rows x N columns + * @param x input matrix of size M rows x K columns + * @param y input matrix of size K rows x N columns + * @param _M number of rows of X and Z + * @param _N number of rows of Y and columns of Z + * @param _K number of columns of X and rows of Y + * @param isZColMajor Storage layout of Z. true = col major, false = row major + * @param isXColMajor Storage layout of X. true = col major, false = row major + * @param isYColMajor Storage layout of Y. true = col major, false = row major + * @param stream cuda stream + * @param alpha scalar + * @param beta scalar + */ +template +void gemm(const raft::handle_t& handle, + T* z, + T* x, + T* y, + int _M, + int _N, + int _K, + bool isZColMajor, + bool isXColMajor, + bool isYColMajor, + cudaStream_t stream, + T alpha = T(1.0), + T beta = T(0.0)) +{ + detail::gemm( + handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, alpha, beta); +} + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.hpp similarity index 73% rename from cpp/include/raft/linalg/gemv.h rename to cpp/include/raft/linalg/gemv.hpp index 9eafb3941a..45766b8c9a 100644 --- a/cpp/include/raft/linalg/gemv.h +++ b/cpp/include/raft/linalg/gemv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,8 @@ #pragma once -#include -#include -#include +#include "detail/gemv.hpp" +#include #include @@ -61,20 +60,20 @@ void gemv(const raft::handle_t& handle, cudaStream_t stream) { cublasHandle_t cublas_h = handle.get_cublas_handle(); - cublas_device_pointer_mode pmode(cublas_h); - RAFT_CUBLAS_TRY(cublasgemv(cublas_h, - trans_a ? CUBLAS_OP_T : CUBLAS_OP_N, - m, - n, - alpha, - A, - lda, - x, - incx, - beta, - y, - incy, - stream)); + detail::cublas_device_pointer_mode pmode(cublas_h); + RAFT_CUBLAS_TRY(detail::cublasgemv(cublas_h, + trans_a ? CUBLAS_OP_T : CUBLAS_OP_N, + m, + n, + alpha, + A, + lda, + x, + incx, + beta, + y, + incy, + stream)); } template @@ -99,16 +98,17 @@ void gemv(const raft::handle_t& handle, * * where * + * @param handle raft handle * @param A is a column-major matrix of size n_rows_a * n_cols_a. * op(A) is either the transpose operation (trans_a == true) or identity. - * - * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. - * set it when you need to use only the first n_rows_a rows of the matrix A, which has - * (perhaps, due to padding) lda rows. - * + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param alpha is a scalar scale of Ax. + * @param beta is a scalar scale of y. + * @param stream stream on which this function is run */ template void gemv(const raft::handle_t& handle, @@ -122,7 +122,7 @@ void gemv(const raft::handle_t& handle, const math_t beta, cudaStream_t stream) { - gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); + detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, alpha, beta, stream); } /** @@ -130,12 +130,15 @@ void gemv(const raft::handle_t& handle, * * where * + * @param handle raft handle * @param A is a column-major matrix of size n_rows_a * n_cols_a. * op(A) is either the transpose operation (trans_a == true) or identity. - * + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param stream stream on which this function is run */ template void gemv(const raft::handle_t& handle, @@ -147,31 +150,27 @@ void gemv(const raft::handle_t& handle, const bool trans_a, cudaStream_t stream) { - math_t alpha = math_t(1); - math_t beta = math_t(0); - - gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); + detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, stream); } /** * y = alpha * op(A) * x + beta * y * * where - * - * @param alpha is a scalar scale of Ax. - * - * @param beta is a scalar scale of y. - * + * @param handle raft handle * @param A is a column-major matrix of size n_rows_a * n_cols_a. * op(A) is either the transpose operation (trans_a == true) or identity. - * + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. 
* set it when you need to use only the first n_rows_a rows of the matrix A, which has * (perhaps, due to padding) lda rows. - * * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param alpha is a scalar scale of Ax. + * @param beta is a scalar scale of y. + * @param stream stream on which this function is run */ template void gemv(const raft::handle_t& handle, @@ -186,27 +185,25 @@ void gemv(const raft::handle_t& handle, const math_t beta, cudaStream_t stream) { - cublasHandle_t cublas_h = handle.get_cublas_handle(); - cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - RAFT_CUBLAS_TRY( - cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, 1, &beta, y, 1, stream)); + detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); } /** * y = op(A) * x * * where - * + * @param handle raft handle * @param A is a column-major matrix of size n_rows_a * n_cols_a. * op(A) is either the transpose operation (trans_a == true) or identity. - * + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. * set it when you need to use only the first n_rows_a rows of the matrix A, which has * (perhaps, due to padding) lda rows. - * * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. - * * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param stream stream on which this function is run * */ template @@ -220,9 +217,7 @@ void gemv(const raft::handle_t& handle, const bool trans_a, cudaStream_t stream) { - math_t alpha = math_t(1); - math_t beta = math_t(0); - gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); + detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, stream); } }; // namespace linalg diff --git a/cpp/include/raft/linalg/init.hpp b/cpp/include/raft/linalg/init.hpp new file mode 100644 index 0000000000..bb577672e8 --- /dev/null +++ b/cpp/include/raft/linalg/init.hpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/init.hpp" + +namespace raft { +namespace linalg { + +/** + * @brief Like Python range. + * + * Fills the output as out[i] = i. + * + * \param [out] out device array, size [end-start] + * \param [in] start of the range + * \param [in] end of range (exclusive) + * \param [in] stream cuda stream + */ +template +void range(T* out, int start, int end, cudaStream_t stream) +{ + detail::range(out, start, end, stream); +} + +/** + * @brief Like Python range. + * + * Fills the output as out[i] = i. 
+ * + * \param [out] out device array, size [n] + * \param [in] n length of the array + * \param [in] stream cuda stream + */ +template +void range(T* out, int n, cudaStream_t stream) +{ + detail::range(out, n, stream); +} + +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 9376994742..e7d965f810 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,982 +16,14 @@ #pragma once -// for cmath: -#define _USE_MATH_DEFINES - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include +#include "detail/lanczos.hpp" namespace raft { -using namespace matrix; -using namespace linalg; - -namespace spectral { - -// curandGeneratorNormalX -inline curandStatus_t curandGenerateNormalX( - curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev) -{ - return curandGenerateNormal(generator, outputPtr, n, mean, stddev); -} -inline curandStatus_t curandGenerateNormalX( - curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev) -{ - return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); -} - -// ========================================================= -// Helper functions -// ========================================================= - -/** - * @brief Perform Lanczos iteration - * Lanczos iteration is performed on a shifted matrix A+shift*I. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param A Matrix. - * @param iter Pointer to current Lanczos iteration. On exit, the - * variable is set equal to the final Lanczos iteration. - * @param maxIter Maximum Lanczos iteration. This function will - * perform a maximum of maxIter-*iter iterations. - * @param shift Matrix shift. - * @param tol Convergence tolerance. Lanczos iteration will - * terminate when the residual norm (i.e. entry in beta_host) is - * less than tol. - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param alpha_host (Output, host memory, maxIter entries) - * Diagonal entries of Lanczos system. - * @param beta_host (Output, host memory, maxIter entries) - * Off-diagonal entries of Lanczos system. - * @param lanczosVecs_dev (Input/output, device memory, - * n*(maxIter+1) entries) Lanczos vectors. Vectors are stored as - * columns of a column-major matrix with dimensions - * n x (maxIter+1). - * @param work_dev (Output, device memory, maxIter entries) - * Workspace. Not needed if full reorthogonalization is disabled. - * @return Zero if successful. Otherwise non-zero. 
- */ -template -int performLanczosIteration(handle_t const& handle, - sparse_matrix_t const* A, - index_type_t* iter, - index_type_t maxIter, - value_type_t shift, - value_type_t tol, - bool reorthogonalize, - value_type_t* __restrict__ alpha_host, - value_type_t* __restrict__ beta_host, - value_type_t* __restrict__ lanczosVecs_dev, - value_type_t* __restrict__ work_dev) -{ - // ------------------------------------------------------- - // Variable declaration - // ------------------------------------------------------- - - // Useful variables - constexpr value_type_t one = 1; - constexpr value_type_t negOne = -1; - constexpr value_type_t zero = 0; - value_type_t alpha; - - auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); - - RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); - - index_type_t n = A->nrows_; - - // ------------------------------------------------------- - // Compute second Lanczos vector - // ------------------------------------------------------- - if (*iter <= 0) { - *iter = 1; - - // Apply matrix - if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, - lanczosVecs_dev, - n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); - A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); - - // Orthogonalize Lanczos vector - RAFT_CUBLAS_TRY(cublasdot( - cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); - - alpha = -alpha_host[0]; - RAFT_CUBLAS_TRY(cublasaxpy( - cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); - RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); - - // Check if Lanczos has converged - if (beta_host[0] <= tol) return 0; - - // Normalize Lanczos vector - alpha = 1 / beta_host[0]; - RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); - } - - // ------------------------------------------------------- - // Compute remaining Lanczos vectors - // ------------------------------------------------------- - - while (*iter < maxIter) { - ++(*iter); - - // Apply matrix - if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, - lanczosVecs_dev + (*iter - 1) * n, - n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); - A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); - - // Full reorthogonalization - // "Twice is enough" algorithm per Kahan and Parlett - if (reorthogonalize) { - RAFT_CUBLAS_TRY(cublasgemv(cublas_h, - CUBLAS_OP_T, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1, - stream)); - - RAFT_CUBLAS_TRY(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); - - CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), - work_dev + (*iter - 1), - sizeof(value_type_t), - cudaMemcpyDeviceToHost, - stream)); - - RAFT_CUBLAS_TRY(cublasgemv(cublas_h, - CUBLAS_OP_T, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1, - stream)); - - RAFT_CUBLAS_TRY(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); - } - - // Orthogonalization with 3-term recurrence relation - else { - RAFT_CUBLAS_TRY(cublasdot(cublas_h, - n, - lanczosVecs_dev + IDX(0, *iter - 1, n), 
- 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - alpha_host + (*iter - 1), - stream)); - - auto alpha = -alpha_host[*iter - 1]; - RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, - n, - &alpha, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); - - alpha = -beta_host[*iter - 2]; - RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, - n, - &alpha, - lanczosVecs_dev + IDX(0, *iter - 2, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); - } - - // Compute residual - RAFT_CUBLAS_TRY(cublasnrm2( - cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); - - // Check if Lanczos has converged - if (beta_host[*iter - 1] <= tol) break; - - // Normalize Lanczos vector - alpha = 1 / beta_host[*iter - 1]; - RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); - } - - CUDA_TRY(cudaStreamSynchronize(stream)); - - return 0; -} - -/** - * @brief Find Householder transform for 3-dimensional system - * Given an input vector v=[x,y,z]', this function finds a - * Householder transform P such that P*v is a multiple of - * e_1=[1,0,0]'. The input vector v is overwritten with the - * Householder vector such that P=I-2*v*v'. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param v (Input/output, host memory, 3 entries) Input - * 3-dimensional vector. On exit, the vector is set to the - * Householder vector. - * @param Pv (Output, host memory, 1 entry) First entry of P*v - * (here v is the input vector). Either equal to ||v||_2 or - * -||v||_2. - * @param P (Output, host memory, 9 entries) Householder transform - * matrix. Matrix dimensions are 3 x 3. - */ -template -static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P) -{ - // Compute norm of vector - *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); - - // Choose whether to reflect to e_1 or -e_1 - // This choice avoids catastrophic cancellation - if (v[0] >= 0) *Pv = -(*Pv); - v[0] -= *Pv; - - // Normalize Householder vector - value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); - if (normHouseholder != 0) { - v[0] /= normHouseholder; - v[1] /= normHouseholder; - v[2] /= normHouseholder; - } else { - v[0] = 0; - v[1] = 0; - v[2] = 0; - } - - // Construct Householder matrix - index_type_t i, j; - for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) - P[IDX(i, j, 3)] = -2 * v[i] * v[j]; - for (i = 0; i < 3; ++i) - P[IDX(i, i, 3)] += 1; -} - -/** - * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix - * The Householder transform is pre-applied to the top three rows - * of the matrix and post-applied to the left three columns. The - * 4 x 4 matrix is intended to contain the bulge that is produced - * in the Francis QR algorithm. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param v (Input, host memory, 3 entries) Householder vector. - * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. 
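 *
 * Illustrative sketch (hypothetical values; not part of the original file):
 * eliminating the first column of a bulge block with this pair of helpers.
 * @code{.cpp}
 *   double v[3]  = {3.0, 4.0, 0.0};  // column to annihilate
 *   double Pv    = 0.0;              // receives +/- ||v||_2
 *   double P[9];                     // 3 x 3 Householder matrix, column-major
 *   double A[16] = {0.0};            // 4 x 4 bulge block, column-major
 *   findHouseholder3<int, double>(v, &Pv, P);  // v becomes the Householder vector
 *   applyHouseholder3<int, double>(v, A);      // pre- and post-applies P to A
 * @endcode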
- */ -template -static void applyHouseholder3(const value_type_t* v, value_type_t* A) -{ - // Loop indices - index_type_t i, j; - // Dot product between Householder vector and matrix row/column - value_type_t vDotA; - - // Pre-apply Householder transform - for (j = 0; j < 4; ++j) { - vDotA = 0; - for (i = 0; i < 3; ++i) - vDotA += v[i] * A[IDX(i, j, 4)]; - for (i = 0; i < 3; ++i) - A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; - } - - // Post-apply Householder transform - for (i = 0; i < 4; ++i) { - vDotA = 0; - for (j = 0; j < 3; ++j) - vDotA += A[IDX(i, j, 4)] * v[j]; - for (j = 0; j < 3; ++j) - A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; - } -} - -/** - * @brief Perform one step of Francis QR algorithm - * Equivalent to two steps of the classical QR algorithm on a - * tridiagonal matrix. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param n Matrix dimension. - * @param shift1 QR algorithm shift. - * @param shift2 QR algorithm shift. - * @param alpha (Input/output, host memory, n entries) Diagonal - * entries of tridiagonal matrix. - * @param beta (Input/output, host memory, n-1 entries) - * Off-diagonal entries of tridiagonal matrix. - * @param V (Input/output, host memory, n*n entries) Orthonormal - * transforms from previous steps of QR algorithm. Matrix - * dimensions are n x n. On exit, the orthonormal transform from - * this Francis QR step is post-applied to the matrix. - * @param work (Output, host memory, 3*n entries) Workspace. - * @return Zero if successful. Otherwise non-zero. - */ -template -static int francisQRIteration(index_type_t n, - value_type_t shift1, - value_type_t shift2, - value_type_t* alpha, - value_type_t* beta, - value_type_t* V, - value_type_t* work) -{ - // ------------------------------------------------------- - // Variable declaration - // ------------------------------------------------------- - - // Temporary storage of 4x4 bulge and Householder vector - value_type_t bulge[16]; - - // Householder vector - value_type_t householder[3]; - // Householder matrix - value_type_t householderMatrix[3 * 3]; - - // Shifts are roots of the polynomial p(x)=x^2+b*x+c - value_type_t b = -shift1 - shift2; - value_type_t c = shift1 * shift2; - - // Loop indices - index_type_t i, j, pos; - // Temporary variable - value_type_t temp; - - // ------------------------------------------------------- - // Implementation - // ------------------------------------------------------- - - // Compute initial Householder transform - householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; - householder[1] = beta[0] * (alpha[0] + alpha[1] + b); - householder[2] = beta[0] * beta[1]; - findHouseholder3(householder, &temp, householderMatrix); - - // Apply initial Householder transform to create bulge - memset(bulge, 0, 16 * sizeof(value_type_t)); - for (i = 0; i < 4; ++i) - bulge[IDX(i, i, 4)] = alpha[i]; - for (i = 0; i < 3; ++i) { - bulge[IDX(i + 1, i, 4)] = beta[i]; - bulge[IDX(i, i + 1, 4)] = beta[i]; - } - applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); - memcpy(V, work, 3 * n * sizeof(value_type_t)); - - // Chase bulge to bottom-right of matrix with Householder transforms - for (pos = 0; pos < n - 4; ++pos) { - // Move to next position - alpha[pos] = bulge[IDX(0, 0, 4)]; - householder[0] = bulge[IDX(1, 0, 4)]; - householder[1] = bulge[IDX(2, 0, 4)]; - householder[2] = bulge[IDX(3, 0, 4)]; - for (j = 0; j < 3; ++j) - 
for (i = 0; i < 3; ++i) - bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - bulge[IDX(3, 0, 4)] = 0; - bulge[IDX(3, 1, 4)] = 0; - bulge[IDX(3, 2, 4)] = beta[pos + 3]; - bulge[IDX(0, 3, 4)] = 0; - bulge[IDX(1, 3, 4)] = 0; - bulge[IDX(2, 3, 4)] = beta[pos + 3]; - bulge[IDX(3, 3, 4)] = alpha[pos + 4]; - - // Apply Householder transform - findHouseholder3(householder, beta + pos, householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); - } - - // Apply penultimate Householder transform - // Values in the last row and column are zero - alpha[n - 4] = bulge[IDX(0, 0, 4)]; - householder[0] = bulge[IDX(1, 0, 4)]; - householder[1] = bulge[IDX(2, 0, 4)]; - householder[2] = bulge[IDX(3, 0, 4)]; - for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) - bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - bulge[IDX(3, 0, 4)] = 0; - bulge[IDX(3, 1, 4)] = 0; - bulge[IDX(3, 2, 4)] = 0; - bulge[IDX(0, 3, 4)] = 0; - bulge[IDX(1, 3, 4)] = 0; - bulge[IDX(2, 3, 4)] = 0; - bulge[IDX(3, 3, 4)] = 0; - findHouseholder3(householder, beta + n - 4, householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); - - // Apply final Householder transform - // Values in the last two rows and columns are zero - alpha[n - 3] = bulge[IDX(0, 0, 4)]; - householder[0] = bulge[IDX(1, 0, 4)]; - householder[1] = bulge[IDX(2, 0, 4)]; - householder[2] = 0; - for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) - bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - findHouseholder3(householder, beta + n - 3, householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); - - // Bulge has been eliminated - alpha[n - 2] = bulge[IDX(0, 0, 4)]; - alpha[n - 1] = bulge[IDX(1, 1, 4)]; - beta[n - 2] = bulge[IDX(1, 0, 4)]; - - return 0; -} - -/** - * @brief Perform implicit restart of Lanczos algorithm - * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Matrix dimension. - * @param iter Current Lanczos iteration. - * @param iter_new Lanczos iteration after restart. - * @param shiftUpper Pointer (host memory) to upper bound for unwanted - * region. Value is ignored if less than *shiftLower. If a - * stronger upper bound has been found, the value is updated on - * exit. - * @param shiftLower Pointer (host memory) to lower bound for unwanted - * region. Value is ignored if greater than *shiftUpper. If a - * stronger lower bound has been found, the value is updated on - * exit. - * @param alpha_host (Input/output, host memory, iter entries) - * Diagonal entries of Lanczos system. - * @param beta_host (Input/output, host memory, iter entries) - * Off-diagonal entries of Lanczos system. - * @param V_host (Output, host memory, iter*iter entries) - * Orthonormal transform used to obtain restarted system. Matrix - * dimensions are iter x iter. - * @param work_host (Output, host memory, 4*iter entries) - * Workspace. 
- * @param lanczosVecs_dev (Input/output, device memory, n*(iter+1)
- *   entries) Lanczos vectors. Vectors are stored as columns of a
- *   column-major matrix with dimensions n x (iter+1).
- * @param work_dev (Output, device memory, (n+iter)*iter entries)
- *   Workspace.
- * @param smallest_eig specifies whether smallest (true) or largest
- *   (false) eigenvalues are to be calculated.
- * @return error flag.
- */
-template <typename index_type_t, typename value_type_t>
-static int lanczosRestart(handle_t const& handle,
-                          index_type_t n,
-                          index_type_t iter,
-                          index_type_t iter_new,
-                          value_type_t* shiftUpper,
-                          value_type_t* shiftLower,
-                          value_type_t* __restrict__ alpha_host,
-                          value_type_t* __restrict__ beta_host,
-                          value_type_t* __restrict__ V_host,
-                          value_type_t* __restrict__ work_host,
-                          value_type_t* __restrict__ lanczosVecs_dev,
-                          value_type_t* __restrict__ work_dev,
-                          bool smallest_eig)
-{
-  // -------------------------------------------------------
-  // Variable declaration
-  // -------------------------------------------------------
-
-  // Useful constants
-  constexpr value_type_t zero = 0;
-  constexpr value_type_t one  = 1;
-
-  auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
-
-  // Loop index
-  index_type_t i;
-
-  // Number of implicit restart steps
-  // Assumed to be even since each call to Francis algorithm is
-  // equivalent to two calls of QR algorithm
-  index_type_t restartSteps = iter - iter_new;
-
-  // Ritz values from Lanczos method
-  value_type_t* ritzVals_host = work_host + 3 * iter;
-  // Shifts for implicit restart
-  value_type_t* shifts_host;
-
-  // Orthonormal matrix for similarity transform
-  value_type_t* V_dev = work_dev + n * iter;
-
-  // -------------------------------------------------------
-  // Implementation
-  // -------------------------------------------------------
-
-  // Compute Ritz values
-  memcpy(ritzVals_host, alpha_host, iter * sizeof(value_type_t));
-  memcpy(work_host, beta_host, (iter - 1) * sizeof(value_type_t));
-  Lapack<value_type_t>::sterf(iter, ritzVals_host, work_host);
-
-  // Debug: Print largest eigenvalues
-  // for (int i = iter-iter_new; i < iter; ++i)
-  //  std::cout <<*(ritzVals_host+i)<< " ";
-  // std::cout <<std::endl;
-
-  // Initialize similarity transform with identity matrix
-  memset(V_host, 0, iter * iter * sizeof(value_type_t));
-  for (i = 0; i < iter; ++i)
-    V_host[IDX(i, i, iter)] = 1;
-
-  // Determine interval to suppress eigenvalues
-  if (smallest_eig) {
-    if (*shiftLower > *shiftUpper) {
-      *shiftUpper = ritzVals_host[iter - 1];
-      *shiftLower = ritzVals_host[iter_new];
-    } else {
-      *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - 1]);
-      *shiftLower = std::min(*shiftLower, ritzVals_host[iter_new]);
-    }
-  } else {
-    if (*shiftLower > *shiftUpper) {
-      *shiftUpper = ritzVals_host[iter - iter_new - 1];
-      *shiftLower = ritzVals_host[0];
-    } else {
-      *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - iter_new - 1]);
-      *shiftLower = std::min(*shiftLower, ritzVals_host[0]);
-    }
-  }
-
-  // Calculate Chebyshev nodes as shifts
-  shifts_host = ritzVals_host;
-  for (i = 0; i < restartSteps; ++i) {
-    shifts_host[i] = cos((i + 0.5) * static_cast<value_type_t>(M_PI) / restartSteps);
-    shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower));
-    shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower));
-  }
-
-  // Apply Francis QR algorithm to implicitly restart Lanczos
-  for (i = 0; i < restartSteps; i += 2)
-    if (francisQRIteration(
-          iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host))
-      WARNING("error in implicitly shifted QR algorithm");
-
-  // Obtain new residual
-  CUDA_TRY(cudaMemcpyAsync(
-    V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream));
-
-  beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
-  RAFT_CUBLAS_TRY(cublasgemv(cublas_h,
CUBLAS_OP_N, - n, - iter, - beta_host + iter_new - 1, - lanczosVecs_dev, - n, - V_dev + IDX(0, iter_new, iter), - 1, - beta_host + iter - 1, - lanczosVecs_dev + IDX(0, iter, n), - 1, - stream)); - - // Obtain new Lanczos vectors - RAFT_CUBLAS_TRY(cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - iter_new, - iter, - &one, - lanczosVecs_dev, - n, - V_dev, - iter, - &zero, - work_dev, - n, - stream)); - - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, - work_dev, - n * iter_new * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); - - // Normalize residual to obtain new Lanczos vector - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), - lanczosVecs_dev + IDX(0, iter, n), - n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); - - RAFT_CUBLAS_TRY(cublasnrm2( - cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); - - auto h_beta = 1 / beta_host[iter_new - 1]; - RAFT_CUBLAS_TRY( - cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); - - return 0; -} - -} // namespace spectral - // ========================================================= // Eigensolver // ========================================================= -/** - * @brief Compute smallest eigenvectors of symmetric matrix - * Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are smallest in - * magnitude. - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param A Matrix. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter Maximum number of Lanczos steps. Does not include - * Lanczos steps used to estimate largest eigenvalue. - * @param restartIter Maximum size of Lanczos system before - * performing an implicit restart. Should be at least 4. - * @param tol Convergence tolerance. Lanczos iteration will - * terminate when the residual norm is less than tol*theta, where - * theta is an estimate for the smallest unwanted eigenvalue - * (i.e. the (nEigVecs+1)th smallest eigenvalue). - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param effIter On exit, pointer to final size of Lanczos system. - * @param totalIter On exit, pointer to total number of Lanczos - * iterations performed. Does not include Lanczos steps used to - * estimate largest eigenvalue. - * @param shift On exit, pointer to matrix shift (estimate for - * largest eigenvalue). - * @param alpha_host (Output, host memory, restartIter entries) - * Diagonal entries of Lanczos system. - * @param beta_host (Output, host memory, restartIter entries) - * Off-diagonal entries of Lanczos system. - * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) - * entries) Lanczos vectors. Vectors are stored as columns of a - * column-major matrix with dimensions n x (restartIter+1). - * @param work_dev (Output, device memory, - * (n+restartIter)*restartIter entries) Workspace. - * @param eigVals_dev (Output, device memory, nEigVecs entries) - * Largest eigenvalues of matrix. - * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) - * Eigenvectors corresponding to smallest eigenvalues of - * matrix. 
Vectors are stored as columns of a column-major matrix - * with dimensions n x nEigVecs. - * @param seed random seed. - * @return error flag. - */ -template -int computeSmallestEigenvectors(handle_t const& handle, - sparse_matrix_t const* A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t* effIter, - index_type_t* totalIter, - value_type_t* shift, - value_type_t* __restrict__ alpha_host, - value_type_t* __restrict__ beta_host, - value_type_t* __restrict__ lanczosVecs_dev, - value_type_t* __restrict__ work_dev, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed) -{ - using namespace spectral; - - // Useful constants - constexpr value_type_t one = 1; - constexpr value_type_t zero = 0; - - // Matrix dimension - index_type_t n = A->nrows_; - - // Shift for implicit restart - value_type_t shiftUpper; - value_type_t shiftLower; - - // Lanczos iteration counters - index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system - - // Status flags - int status; - - // Loop index - index_type_t i; - - // Host memory - value_type_t* Z_host; // Eigenvectors in Lanczos basis - value_type_t* work_host; // Workspace - - // ------------------------------------------------------- - // Check that parameters are valid - // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); - RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); - RAFT_EXPECTS(tol > 0, "Invalid tolerance."); - RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); - RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); - - auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); - - // ------------------------------------------------------- - // Variable initialization - // ------------------------------------------------------- - - // Total number of Lanczos iterations - *totalIter = 0; - - // Allocate host memory - std::vector Z_host_v(restartIter * restartIter); - std::vector work_host_v(4 * restartIter); - - Z_host = Z_host_v.data(); - work_host = work_host_v.data(); - - // Initialize cuBLAS - RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // ------------------------------------------------------- - // Compute largest eigenvalue to determine shift - // ------------------------------------------------------- - - // Random number generator - curandGenerator_t randGen; - // Initialize random number generator - curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); - - curandSetPseudoRandomGeneratorSeed(randGen, seed); - - // Initialize initial Lanczos vector - curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); - value_type_t normQ1; - RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); - - auto h_val = 1 / normQ1; - RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); - - // Obtain tridiagonal matrix with Lanczos - *effIter = 0; - *shift = 0; - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0.0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); - if (status) WARNING("error in Lanczos iteration"); - - // Determine largest eigenvalue - - Lapack::sterf(*effIter, alpha_host, beta_host); - *shift = -alpha_host[*effIter - 1]; - - // ------------------------------------------------------- 
- // Compute eigenvectors of shifted matrix - // ------------------------------------------------------- - - // Obtain tridiagonal matrix with Lanczos - *effIter = 0; - - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); - if (status) WARNING("error in Lanczos iteration"); - *totalIter += *effIter; - - // Apply Lanczos method until convergence - shiftLower = 1; - shiftUpper = -1; - while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { - // Determine number of restart steps - // Number of steps must be even due to Francis algorithm - index_type_t iter_new = nEigVecs + 1; - if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) - iter_new = restartIter - (maxIter - *totalIter); - if ((restartIter - iter_new) % 2) iter_new -= 1; - if (iter_new == *effIter) break; - - // Implicit restart of Lanczos method - status = lanczosRestart(handle, - n, - *effIter, - iter_new, - &shiftUpper, - &shiftLower, - alpha_host, - beta_host, - Z_host, - work_host, - lanczosVecs_dev, - work_dev, - true); - if (status) WARNING("error in Lanczos implicit restart"); - *effIter = iter_new; - - // Check for convergence - if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; - - // Proceed with Lanczos method - - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - tol * fabs(shiftLower), - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); - if (status) WARNING("error in Lanczos iteration"); - *totalIter += *effIter - iter_new; - } - - // Warning if Lanczos has failed to converge - if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) { - WARNING("implicitly restarted Lanczos failed to converge"); - } - - // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); - memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); - Lapack::steqr('I', - *effIter, - work_host + 2 * (*effIter), - work_host + 3 * (*effIter), - Z_host, - *effIter, - work_host); - - // Obtain desired eigenvalues by applying shift - for (i = 0; i < *effIter; ++i) - work_host[i + 2 * (*effIter)] -= *shift; - for (i = *effIter; i < nEigVecs; ++i) - work_host[i + 2 * (*effIter)] = 0; - - // Copy results to device memory - CUDA_TRY(cudaMemcpyAsync(eigVals_dev, - work_host + 2 * (*effIter), - nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, - stream)); - - CUDA_TRY(cudaMemcpyAsync(work_dev, - Z_host, - (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, - stream)); - CHECK_CUDA(stream); - - // Convert eigenvectors from Lanczos basis to standard basis - RAFT_CUBLAS_TRY(cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - nEigVecs, - *effIter, - &one, - lanczosVecs_dev, - n, - work_dev, - *effIter, - &zero, - eigVecs_dev, - n, - stream)); - - // Clean up and exit - curandDestroyGenerator(randGen); - return 0; -} - /** * @brief Compute smallest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least @@ -1042,344 +74,17 @@ int computeSmallestEigenvectors(handle_t const& handle, value_type_t* __restrict__ eigVecs_dev, unsigned long long seed = 1234567) { - using namespace spectral; - - // Matrix dimension - index_type_t n = A.nrows_; - - // Check that parameters are valid - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); - RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); - 
RAFT_EXPECTS(tol > 0, "Invalid tolerance."); - RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); - RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); - - // Allocate memory - std::vector alpha_host_v(restartIter); - std::vector beta_host_v(restartIter); - - value_type_t* alpha_host = alpha_host_v.data(); - value_type_t* beta_host = beta_host_v.data(); - - vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); - vector_t work_dev(handle, (n + restartIter) * restartIter); - - // Perform Lanczos method - index_type_t effIter; - value_type_t shift; - int status = computeSmallestEigenvectors(handle, - &A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - &effIter, - &iter, - &shift, - alpha_host, - beta_host, - lanczosVecs_dev.raw(), - work_dev.raw(), - eigVals_dev, - eigVecs_dev, - seed); - - // Clean up and return - return status; -} - -// ========================================================= -// Eigensolver -// ========================================================= - -/** - * @brief Compute largest eigenvectors of symmetric matrix - * Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are largest in - * magnitude. - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param A Matrix. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter Maximum number of Lanczos steps. - * @param restartIter Maximum size of Lanczos system before - * performing an implicit restart. Should be at least 4. - * @param tol Convergence tolerance. Lanczos iteration will - * terminate when the residual norm is less than tol*theta, where - * theta is an estimate for the largest unwanted eigenvalue - * (i.e. the (nEigVecs+1)th largest eigenvalue). - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param effIter On exit, pointer to final size of Lanczos system. - * @param totalIter On exit, pointer to total number of Lanczos - * iterations performed. - * @param alpha_host (Output, host memory, restartIter entries) - * Diagonal entries of Lanczos system. - * @param beta_host (Output, host memory, restartIter entries) - * Off-diagonal entries of Lanczos system. - * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) - * entries) Lanczos vectors. Vectors are stored as columns of a - * column-major matrix with dimensions n x (restartIter+1). - * @param work_dev (Output, device memory, - * (n+restartIter)*restartIter entries) Workspace. - * @param eigVals_dev (Output, device memory, nEigVecs entries) - * Largest eigenvalues of matrix. - * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) - * Eigenvectors corresponding to largest eigenvalues of - * matrix. Vectors are stored as columns of a column-major matrix - * with dimensions n x nEigVecs. - * @param seed random seed. - * @return error flag. 
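 *
 * Illustrative call sequence (hypothetical buffers, assuming float data and
 * int indexing; not part of the original file):
 * @code{.cpp}
 *   int effIter = 0, totalIter = 0;
 *   std::vector<float> alpha(restartIter), beta(restartIter);
 *   // lanczosVecs_dev, work_dev, eigVals_dev, eigVecs_dev: device buffers
 *   // sized as documented above
 *   computeLargestEigenvectors(handle, &A, nEigVecs, maxIter, restartIter,
 *                              tol, true, &effIter, &totalIter,
 *                              alpha.data(), beta.data(),
 *                              lanczosVecs_dev, work_dev,
 *                              eigVals_dev, eigVecs_dev, 1234567ULL);
 * @endcode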
- */ -template -int computeLargestEigenvectors(handle_t const& handle, - sparse_matrix_t const* A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t* effIter, - index_type_t* totalIter, - value_type_t* __restrict__ alpha_host, - value_type_t* __restrict__ beta_host, - value_type_t* __restrict__ lanczosVecs_dev, - value_type_t* __restrict__ work_dev, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed) -{ - using namespace spectral; - - // Useful constants - constexpr value_type_t one = 1; - constexpr value_type_t zero = 0; - - // Matrix dimension - index_type_t n = A->nrows_; - - // Lanczos iteration counters - index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system - - // Status flags - int status; - - // Loop index - index_type_t i; - - // Host memory - value_type_t* Z_host; // Eigenvectors in Lanczos basis - value_type_t* work_host; // Workspace - - // ------------------------------------------------------- - // Check that LAPACK is enabled - // ------------------------------------------------------- - // Lapack::check_lapack_enabled(); - - // ------------------------------------------------------- - // Check that parameters are valid - // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); - RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); - RAFT_EXPECTS(tol > 0, "Invalid tolerance."); - RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); - RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); - - auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); - - // ------------------------------------------------------- - // Variable initialization - // ------------------------------------------------------- - - // Total number of Lanczos iterations - *totalIter = 0; - - // Allocate host memory - std::vector Z_host_v(restartIter * restartIter); - std::vector work_host_v(4 * restartIter); - - Z_host = Z_host_v.data(); - work_host = work_host_v.data(); - - // Initialize cuBLAS - RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // ------------------------------------------------------- - // Compute largest eigenvalue - // ------------------------------------------------------- - - // Random number generator - curandGenerator_t randGen; - // Initialize random number generator - curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); - curandSetPseudoRandomGeneratorSeed(randGen, seed); - // Initialize initial Lanczos vector - curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); - value_type_t normQ1; - RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); - - auto h_val = 1 / normQ1; - RAFT_CUBLAS_TRY(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); - - // Obtain tridiagonal matrix with Lanczos - *effIter = 0; - value_type_t shift_val = 0.0; - value_type_t* shift = &shift_val; - - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); - if (status) WARNING("error in Lanczos iteration"); - *totalIter += *effIter; - - // Apply Lanczos method until convergence - value_type_t shiftLower = 1; - value_type_t shiftUpper = -1; - while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { - 
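// (Each pass below shrinks the Lanczos system back to iter_new vectors via an
// implicit restart, then extends it again; shiftLower/shiftUpper bracket the
// unwanted lower end of the spectrum used in the convergence test above.)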
// Determine number of restart steps
-    // Number of steps must be even due to Francis algorithm
-    index_type_t iter_new = nEigVecs + 1;
-    if (restartIter - (maxIter - *totalIter) > nEigVecs + 1)
-      iter_new = restartIter - (maxIter - *totalIter);
-    if ((restartIter - iter_new) % 2) iter_new -= 1;
-    if (iter_new == *effIter) break;
-
-    // Implicit restart of Lanczos method
-    status = lanczosRestart(handle,
-                            n,
-                            *effIter,
-                            iter_new,
-                            &shiftUpper,
-                            &shiftLower,
-                            alpha_host,
-                            beta_host,
-                            Z_host,
-                            work_host,
-                            lanczosVecs_dev,
-                            work_dev,
-                            false);
-    if (status) WARNING("error in Lanczos implicit restart");
-    *effIter = iter_new;
-
-    // Check for convergence
-    if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break;
-
-    // Proceed with Lanczos method
-
-    status = performLanczosIteration(handle,
-                                     A,
-                                     effIter,
-                                     maxIter_curr,
-                                     *shift,
-                                     tol * fabs(shiftLower),
-                                     reorthogonalize,
-                                     alpha_host,
-                                     beta_host,
-                                     lanczosVecs_dev,
-                                     work_dev);
-    if (status) WARNING("error in Lanczos iteration");
-    *totalIter += *effIter - iter_new;
-  }
-
-  // Warning if Lanczos has failed to converge
-  if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) {
-    WARNING("implicitly restarted Lanczos failed to converge");
-  }
-  for (int i = 0; i < restartIter; ++i) {
-    for (int j = 0; j < restartIter; ++j)
-      Z_host[i * restartIter + j] = 0;
-  }
-  // Solve tridiagonal system
-  memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t));
-  memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t));
-  Lapack<value_type_t>::steqr('I',
-                              *effIter,
-                              work_host + 2 * (*effIter),
-                              work_host + 3 * (*effIter),
-                              Z_host,
-                              *effIter,
-                              work_host);
-
-  // note: we need the top nEigVecs eigenpairs, but effIter can be larger than
-  // nEigVecs, so we add an offset to index the top nEigVecs eigenpairs in the
-  // matrix of size effIter. The eigenvalues are sorted in ascending order, so
-  // no offset is needed in the smallest-eigenvalues case: the first entries
-  // are already the smallest ones.
-
-  index_type_t top_eigenparis_idx_offset = *effIter - nEigVecs;
-
-  // Debug : print nEigVecs largest eigenvalues
-  // for (int i = top_eigenparis_idx_offset; i < *effIter; ++i)
-  //  std::cout <<*(work_host+(2*(*effIter)+i))<< " ";
-  // std::cout <<std::endl;
-
-  // Obtain desired eigenvalues by applying shift
-  for (i = 0; i < *effIter; ++i)
-    work_host[i + 2 * (*effIter)] -= *shift;
-
-  for (i = 0; i < top_eigenparis_idx_offset; ++i)
-    work_host[i + 2 * (*effIter)] = 0;
-
-  // Copy results to device memory, skipping the smallest eigenpairs if needed
-  CUDA_TRY(cudaMemcpyAsync(eigVals_dev,
-                           work_host + 2 * (*effIter) + top_eigenparis_idx_offset,
-                           nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice,
-                           stream));
-
-  CUDA_TRY(cudaMemcpyAsync(work_dev,
-                           Z_host + top_eigenparis_idx_offset * (*effIter),
-                           (*effIter) * nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice,
-                           stream));
-  CHECK_CUDA(stream);
-
-  // Convert eigenvectors from Lanczos basis to standard basis
-  RAFT_CUBLAS_TRY(cublasgemm(cublas_h,
-                             CUBLAS_OP_N,
-                             CUBLAS_OP_N,
-                             n,
-                             nEigVecs,
-                             *effIter,
-                             &one,
-                             lanczosVecs_dev,
-                             n,
-                             work_dev,
-                             *effIter,
-                             &zero,
-                             eigVecs_dev,
-                             n,
-                             stream));
-
-  // Clean up and exit
-  curandDestroyGenerator(randGen);
-  return 0;
-}
-
 template <typename index_type_t, typename value_type_t>
 int computeLargestEigenvectors(handle_t const& handle,
                                sparse_matrix_t<index_type_t, value_type_t> const& A,
                                index_type_t nEigVecs,
                                index_type_t maxIter,
                                index_type_t restartIter,
                                value_type_t tol,
                                bool reorthogonalize,
                                index_type_t& iter,
                                value_type_t* __restrict__ eigVals_dev,
                                value_type_t* __restrict__ eigVecs_dev,
                                unsigned long long seed = 8675309)
 {
-  using namespace spectral;
-
-  // Matrix dimension
-  index_type_t n = A.nrows_;
-
-  // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
-  RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
-  RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
-  RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
-  RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter.");
-
-  // Allocate memory
-  std::vector<value_type_t> alpha_host_v(restartIter);
-  std::vector<value_type_t> beta_host_v(restartIter);
-
-  value_type_t* alpha_host = alpha_host_v.data();
-  value_type_t* beta_host  = beta_host_v.data();
-
-  vector_t<value_type_t> lanczosVecs_dev(handle, n * (restartIter + 1));
-  vector_t<value_type_t> work_dev(handle, (n + restartIter) * restartIter);
-
-  // Perform Lanczos method
-  index_type_t effIter;
-  int status = computeLargestEigenvectors(handle,
-                                          &A,
-                                          nEigVecs,
-                                          maxIter,
-                                          restartIter,
-                                          tol,
-                                          reorthogonalize,
-                                          &effIter,
-                                          &iter,
-                                          alpha_host,
-                                          beta_host,
-                                          lanczosVecs_dev.raw(),
-                                          work_dev.raw(),
-                                          eigVals_dev,
-                                          eigVecs_dev,
-                                          seed);
-
-  // Clean up and return
-  return status;
+  return detail::computeLargestEigenvectors(handle,
+                                            A,
+                                            nEigVecs,
+                                            maxIter,
+                                            restartIter,
+                                            tol,
+                                            reorthogonalize,
+                                            iter,
+                                            eigVals_dev,
+                                            eigVecs_dev,
+                                            seed);
 }
 
 }  // namespace raft
diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.hpp
similarity index 61%
rename from cpp/include/raft/linalg/map.cuh
rename to cpp/include/raft/linalg/map.hpp
index 4facc5e72c..febeaa8621 100644
--- a/cpp/include/raft/linalg/map.cuh
+++ b/cpp/include/raft/linalg/map.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,32 +16,11 @@
 
 #pragma once
 
-#include
-#include
-#include
-#include
+#include "detail/map.cuh"
 
 namespace raft {
 namespace linalg {
 
-template
-__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args)
-{
-  auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
-
-  if (idx < len) { out[idx] = map(in[idx], args[idx]...); }
-}
-
-template
-void mapImpl(
-  OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
-{
-  const int nblks = raft::ceildiv(len, (size_t)TPB);
-  mapKernel<<<nblks, TPB, 0, stream>>>(out, len, map, in, args...);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-}
-
 /**
 * @brief CUDA version of map
 * @tparam InType data-type upon which the math operation will be performed
 * @tparam MapOp the device-lambda performing the actual operation
 * @tparam TPB threads-per-block in the final kernel launched
 * @tparam Args additional parameters
 * @param out the output array
 * @param len number of elements in the input array
 * @param map the device-lambda
 * @param stream cuda-stream where to launch this kernel
 * @param in the input array
 * @param args additional input arrays
 */
@@ -64,7 +43,7 @@
 template
 void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
 {
-  mapImpl(out, len, map, stream, in, args...);
+  detail::mapImpl(out, len, map, stream, in, args...);
 }
 
 }  // namespace linalg
diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp
new file mode 100644
index 0000000000..04275995a0
--- /dev/null
+++ b/cpp/include/raft/linalg/map_then_reduce.hpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/map_then_reduce.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief CUDA version of map and then sum reduction operation + * @tparam Type data-type upon which the math operation will be performed + * @tparam MapOp the device-lambda performing the actual operation + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Args additional parameters + * @param out the output sum-reduced value (assumed to be a device pointer) + * @param len number of elements in the input array + * @param map the device-lambda + * @param stream cuda-stream where to launch this kernel + * @param in the input array + * @param args additional input arrays + */ + +template +void mapThenSumReduce( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ + detail::mapThenReduceImpl( + out, len, (OutType)0, map, detail::sum_tag(), stream, in, args...); +} + +/** + * @brief CUDA version of map and then generic reduction operation + * @tparam Type data-type upon which the math operation will be performed + * @tparam MapOp the device-lambda performing the actual map operation + * @tparam ReduceLambda the device-lambda performing the actual reduction + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Args additional parameters + * @param out the output reduced value (assumed to be a device pointer) + * @param len number of elements in the input array + * @param neutral The neutral element of the reduction operation. For example: + * 0 for sum, 1 for multiply, +Inf for Min, -Inf for Max + * @param map the device-lambda + * @param op the reduction device lambda + * @param stream cuda-stream where to launch this kernel + * @param in the input array + * @param args additional input arrays + */ + +template +void mapThenReduce(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + cudaStream_t stream, + const InType* in, + Args... args) +{ + detail::mapThenReduceImpl( + out, len, neutral, map, op, stream, in, args...); +} +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.hpp similarity index 89% rename from cpp/include/raft/linalg/matrix_vector_op.cuh rename to cpp/include/raft/linalg/matrix_vector_op.hpp index 750eca0742..b9790ebce2 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include +#include "detail/matrix_vector_op.cuh" namespace raft { namespace linalg { @@ -55,10 +55,7 @@ void matrixVectorOp(Type* out, Lambda op, cudaStream_t stream) { - IdxType stride = rowMajor ? D : N; - IdxType nLines = rowMajor ? 
N : D; - return matrix::linewiseOp( - out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec); + detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); } /** @@ -97,10 +94,7 @@ void matrixVectorOp(Type* out, Lambda op, cudaStream_t stream) { - IdxType stride = rowMajor ? D : N; - IdxType nLines = rowMajor ? N : D; - return matrix::linewiseOp( - out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec1, vec2); + detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.hpp similarity index 82% rename from cpp/include/raft/linalg/mean_squared_error.cuh rename to cpp/include/raft/linalg/mean_squared_error.hpp index a3fcc5bac6..42af8642b6 100644 --- a/cpp/include/raft/linalg/mean_squared_error.cuh +++ b/cpp/include/raft/linalg/mean_squared_error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include "map_then_reduce.cuh" +#include "detail/mean_squared_error.hpp" namespace raft { namespace linalg { @@ -36,11 +36,7 @@ template void meanSquaredError( math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) { - auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) { - math_t diff = a - b; - return diff * diff * weight / len; - }; - mapThenSumReduce(out, len, sq_diff, stream, A, B); + detail::meanSquaredError(out, A, B, len, weight, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.hpp similarity index 88% rename from cpp/include/raft/linalg/multiply.cuh rename to cpp/include/raft/linalg/multiply.hpp index 53d57ecd00..4a1628b44a 100644 --- a/cpp/include/raft/linalg/multiply.cuh +++ b/cpp/include/raft/linalg/multiply.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include "unary_op.cuh" +#include "detail/multiply.hpp" namespace raft { namespace linalg { @@ -35,8 +35,7 @@ namespace linalg { template void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) { - unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream); + detail::multiplyScalar(out, in, scalar, len, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.hpp similarity index 65% rename from cpp/include/raft/linalg/norm.cuh rename to cpp/include/raft/linalg/norm.hpp index 82558c8023..a6336769ca 100644 --- a/cpp/include/raft/linalg/norm.cuh +++ b/cpp/include/raft/linalg/norm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
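Each of the thinned wrappers above keeps its original public signature and simply forwards to the detail:: implementation, so existing call sites compile unchanged. A minimal caller sketch (illustrative only; assumes device pointers d_a, d_b, d_out of length n, a single-element device pointer d_mse, and a valid cudaStream_t stream):

    // weighted mean squared error of two device arrays, reduced into d_mse
    raft::linalg::meanSquaredError(d_mse, d_a, d_b, n, 1.0f, stream);
    // elementwise multiplication by a host-side scalar
    raft::linalg::multiplyScalar(d_out, d_a, 2.0f, n, stream);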
@@ -16,13 +16,15 @@ #pragma once -#include "reduce.cuh" +#include "detail/norm.hpp" namespace raft { namespace linalg { /** different types of norms supported on the input buffers */ -enum NormType { L1Norm = 0, L2Norm }; +using detail::L1Norm; +using detail::L2Norm; +using detail::NormType; /** * @brief Compute row-wise norm of the input matrix and perform fin_op lambda @@ -54,37 +56,7 @@ void rowNorm(Type* dots, cudaStream_t stream, Lambda fin_op = raft::Nop()) { - switch (type) { - case L1Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - true, - stream, - false, - raft::L1Op(), - raft::Sum(), - fin_op); - break; - case L2Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - true, - stream, - false, - raft::L2Op(), - raft::Sum(), - fin_op); - break; - default: ASSERT(false, "Invalid norm type passed! [%d]", type); - }; + detail::rowNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op); } /** @@ -111,37 +83,7 @@ void colNorm(Type* dots, cudaStream_t stream, Lambda fin_op = raft::Nop()) { - switch (type) { - case L1Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - false, - stream, - false, - raft::L1Op(), - raft::Sum(), - fin_op); - break; - case L2Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - false, - stream, - false, - raft::L2Op(), - raft::Sum(), - fin_op); - break; - default: ASSERT(false, "Invalid norm type passed! [%d]", type); - }; + detail::colNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/qr.hpp b/cpp/include/raft/linalg/qr.hpp new file mode 100644 index 0000000000..50e97e4069 --- /dev/null +++ b/cpp/include/raft/linalg/qr.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "detail/qr.cuh" + +namespace raft { +namespace linalg { + +/** + * @defgroup QRdecomp QR decomposition + * @{ + */ + +/** + * @brief compute QR decomp and return only Q matrix + * @param handle: raft handle + * @param M: input matrix + * @param Q: Q matrix to be returned (on GPU) + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param stream cuda stream + * @{ + */ +template +void qrGetQ(const raft::handle_t& handle, + const math_t* M, + math_t* Q, + int n_rows, + int n_cols, + cudaStream_t stream) +{ + detail::qrGetQ(handle, M, Q, n_rows, n_cols, stream); +} + +/** + * @brief compute QR decomp and return both Q and R matrices + * @param handle: raft handle + * @param M: input matrix + * @param Q: Q matrix to be returned (on GPU) + * @param R: R matrix to be returned (on GPU) + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param stream cuda stream + */ +template +void qrGetQR(const raft::handle_t& handle, + math_t* M, + math_t* Q, + math_t* R, + int n_rows, + int n_cols, + cudaStream_t stream) +{ + detail::qrGetQR(handle, M, Q, R, n_rows, n_cols, stream); +} +/** @} */ + +}; // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.hpp similarity index 81% rename from cpp/include/raft/linalg/reduce.cuh rename to cpp/include/raft/linalg/reduce.hpp index 1f14f6eb31..1c4ef70df8 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,7 @@ #pragma once -#include "coalesced_reduction.cuh" -#include "strided_reduction.cuh" -#include +#include "detail/reduce.hpp" namespace raft { namespace linalg { @@ -71,15 +69,8 @@ void reduce(OutType* dots, ReduceLambda reduce_op = raft::Sum(), FinalLambda final_op = raft::Nop()) { - if (rowMajor && alongRows) { - coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else if (rowMajor && !alongRows) { - stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else if (!rowMajor && alongRows) { - stridedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); - } else { - coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); - } + detail::reduce( + dots, data, D, N, init, rowMajor, alongRows, stream, inplace, main_op, reduce_op, final_op); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp new file mode 100644 index 0000000000..0f97323e5a --- /dev/null +++ b/cpp/include/raft/linalg/strided_reduction.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/strided_reduction.cuh"
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief Compute reduction of the input matrix along the strided dimension
+ *
+ * @tparam InType the data type of the input
+ * @tparam OutType the data type of the output (as well as the data type for
+ * which reduction is performed)
+ * @tparam IdxType data type of the indices of the array
+ * @tparam MainLambda Unary lambda applied during accumulation (eg: L1 or L2 norm)
+ * It must be a 'callable' supporting the following input and output:
+ * <pre>OutType (*MainLambda)(InType, IdxType);</pre>
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm)
+ * It must be a 'callable' supporting the following input and output:
+ * <pre>OutType (*ReduceLambda)(OutType, OutType);</pre>
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm)
+ * It must be a 'callable' supporting the following input and output:
+ * <pre>OutType (*FinalLambda)(OutType);</pre>
+ * @param dots the output reduction vector + * @param data the input matrix + * @param D leading dimension of data + * @param N second dimension data + * @param init initial value to use for the reduction + * @param main_op elementwise operation to apply before reduction + * @param reduce_op binary reduction operation + * @param final_op elementwise operation to apply before storing results + * @param inplace reduction result added inplace or overwrites old values? + * @param stream cuda stream where to launch work + */ +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void stridedReduction(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); +} + +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.hpp similarity index 68% rename from cpp/include/raft/linalg/subtract.cuh rename to cpp/include/raft/linalg/subtract.hpp index b33378bf33..9d48948cad 100644 --- a/cpp/include/raft/linalg/subtract.cuh +++ b/cpp/include/raft/linalg/subtract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,7 @@ #pragma once -#include "binary_op.cuh" -#include "unary_op.cuh" -#include +#include "detail/subtract.cuh" namespace raft { namespace linalg { @@ -40,8 +38,7 @@ namespace linalg { template void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) { - auto op = [scalar] __device__(InT in) { return OutT(in - scalar); }; - unaryOp(out, in, len, op, stream); + detail::subtractScalar(out, in, scalar, len, stream); } /** @@ -60,19 +57,7 @@ void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStrea template void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) { - auto op = [] __device__(InT a, InT b) { return OutT(a - b); }; - binaryOp(out, in1, in2, len, op, stream); -} - -template -__global__ void subtract_dev_scalar_kernel(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len) -{ - // TODO: kernel do not use shared memory in current implementation - int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { outDev[i] = inDev[i] - *singleScalarDev; } + detail::subtract(out, in1, in2, len, stream); } /** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and @@ -93,12 +78,7 @@ void subtractDevScalar(math_t* outDev, IdxType len, cudaStream_t stream) { - // Just for the note - there is no way to express such operation with cuBLAS in effective way - // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda - const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); - subtract_dev_scalar_kernel - <<>>(outDev, inDev, singleScalarDev, len); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + detail::subtractDevScalar(outDev, inDev, singleScalarDev, len, stream); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/svd.hpp b/cpp/include/raft/linalg/svd.hpp new file mode 100644 index 
0000000000..a30180b174
--- /dev/null
+++ b/cpp/include/raft/linalg/svd.hpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/svd.hpp"
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief singular value decomposition (SVD) on the column major
+ * input matrix using the QR method
+ * @param handle: raft handle
+ * @param in: input matrix
+ * @param n_rows: number rows of input matrix
+ * @param n_cols: number columns of input matrix
+ * @param sing_vals: singular values of input matrix
+ * @param left_sing_vecs: left singular vectors of input matrix
+ * @param right_sing_vecs: right singular vectors of input matrix
+ * @param trans_right: transpose right vectors or not
+ * @param gen_left_vec: generate left singular vectors. Not activated.
+ * @param gen_right_vec: generate right singular vectors. Not activated.
+ * @param stream cuda stream
+ */
+// TODO: activate gen_left_vec and gen_right_vec options
+// TODO: couldn't template this function due to cusolverDnSgesvd and
+// cusolverDnDgesvd. Check if there is any other way.
+template <typename T>
+void svdQR(const raft::handle_t& handle,
+           T* in,
+           int n_rows,
+           int n_cols,
+           T* sing_vals,
+           T* left_sing_vecs,
+           T* right_sing_vecs,
+           bool trans_right,
+           bool gen_left_vec,
+           bool gen_right_vec,
+           cudaStream_t stream)
+{
+  detail::svdQR(handle,
+                in,
+                n_rows,
+                n_cols,
+                sing_vals,
+                left_sing_vecs,
+                right_sing_vecs,
+                trans_right,
+                gen_left_vec,
+                gen_right_vec,
+                stream);
+}
+
+/**
+ * @brief singular value decomposition (SVD) on the column major input
+ * matrix via eigendecomposition of its Gram matrix
+ * @param handle: raft handle
+ * @param in: input matrix
+ * @param n_rows: number rows of input matrix
+ * @param n_cols: number columns of input matrix
+ * @param S: singular values of input matrix
+ * @param U: left singular vectors of input matrix
+ * @param V: right singular vectors of input matrix
+ * @param gen_left_vec: generate left singular vectors or not
+ * @param stream cuda stream
+ */
+template <typename T>
+void svdEig(const raft::handle_t& handle,
+            T* in,
+            int n_rows,
+            int n_cols,
+            T* S,
+            T* U,
+            T* V,
+            bool gen_left_vec,
+            cudaStream_t stream)
+{
+  detail::svdEig(handle, in, n_rows, n_cols, S, U, V, gen_left_vec, stream);
+}
+
+/**
+ * @brief singular value decomposition (SVD) on the column major input
+ * matrix using Jacobi method
+ * @param handle: raft handle
+ * @param in: input matrix
+ * @param n_rows: number rows of input matrix
+ * @param n_cols: number columns of input matrix
+ * @param sing_vals: singular values of input matrix
+ * @param left_sing_vecs: left singular vectors of input matrix
+ * @param right_sing_vecs: right singular vectors of input matrix
+ * @param gen_left_vec: generate left singular vectors. Not activated.
+ * @param gen_right_vec: generate right singular vectors. Not activated.
+ * @param tol: error tolerance for the Jacobi method. Algorithm stops when the
+ * error is below tol
+ * @param max_sweeps: number of sweeps in the Jacobi algorithm. The more
+ * sweeps, the better the accuracy.
diff --git a/cpp/include/raft/linalg/transpose.hpp b/cpp/include/raft/linalg/transpose.hpp
new file mode 100644
index 0000000000..50608877fa
--- /dev/null
+++ b/cpp/include/raft/linalg/transpose.hpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/transpose.hpp"
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief transpose on the column major input matrix
+ * @param handle: raft handle
+ * @param in: input matrix
+ * @param out: output matrix, the transposed input matrix
+ * @param n_rows: number rows of input matrix
+ * @param n_cols: number columns of input matrix
+ * @param stream: cuda stream
+ */
+template <typename math_t>
+void transpose(const raft::handle_t& handle,
+               math_t* in,
+               math_t* out,
+               int n_rows,
+               int n_cols,
+               cudaStream_t stream)
+{
+  detail::transpose(handle, in, out, n_rows, n_cols, stream);
+}
+
+/**
+ * @brief in-place transpose of a square, column major input matrix
+ * @param inout: input and output matrix
+ * @param n: number of rows and columns of input matrix
+ * @param stream: cuda stream
+ */
+template <typename math_t>
+void transpose(math_t* inout, int n, cudaStream_t stream)
+{
+  detail::transpose(inout, n, stream);
+}
+
+};  // end namespace linalg
+};  // end namespace raft
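transpose.hpp follows the same wrapper pattern; a short sketch of both overloads (dimensions and rmm buffer setup are illustrative assumptions):

#include <raft/handle.hpp>
#include <raft/linalg/transpose.hpp>
#include <rmm/device_uvector.hpp>

void transpose_example(const raft::handle_t& handle, float* in, int n_rows, int n_cols)
{
  cudaStream_t stream = handle.get_stream();
  rmm::device_uvector<float> out(size_t(n_rows) * n_cols, stream);
  // column major n_rows x n_cols -> column major n_cols x n_rows
  raft::linalg::transpose(handle, in, out.data(), n_rows, n_cols, stream);
  // a square matrix could instead be transposed in place with the second overload:
  // raft::linalg::transpose(inout, n, stream);
}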
diff --git a/cpp/include/raft/linalg/unary_op.hpp b/cpp/include/raft/linalg/unary_op.hpp
new file mode 100644
index 0000000000..51faa2e4a4
--- /dev/null
+++ b/cpp/include/raft/linalg/unary_op.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/unary_op.cuh"
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief perform an element-wise unary operation on the input array
+ * @tparam InType input data-type
+ * @tparam Lambda the device-lambda performing the actual operation
+ * @tparam OutType output data-type
+ * @tparam IdxType Integer type used for addressing
+ * @tparam TPB threads-per-block in the final kernel launched
+ * @param out the output array
+ * @param in the input array
+ * @param len number of elements in the input array
+ * @param op the device-lambda
+ * @param stream cuda stream where to launch work
+ * @note Lambda must be a functor with the following signature:
+ *       `OutType func(const InType& val);`
+ */
+template <typename InType,
+          typename Lambda,
+          typename OutType = InType,
+          typename IdxType = int,
+          int TPB          = 256>
+void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream)
+{
+  detail::unaryOpCaller(out, in, len, op, stream);
+}
+
+/**
+ * @brief Perform an element-wise unary operation into the output array
+ *
+ * Compared to `unaryOp()`, this method does not do any reads from any inputs
+ *
+ * @tparam OutType output data-type
+ * @tparam Lambda the device-lambda performing the actual operation
+ * @tparam IdxType Integer type used for addressing
+ * @tparam TPB threads-per-block in the final kernel launched
+ *
+ * @param[out] out    the output array [on device] [len = len]
+ * @param[in]  len    number of elements in the input array
+ * @param[in]  op     the device-lambda which must be of the form:
+ *                    `void func(OutType* outLocationOffset, IdxType idx);`
+ *                    where outLocationOffset will be out + idx.
+ * @param[in]  stream cuda stream where to launch work
+ */
+template <typename OutType, typename Lambda, typename IdxType = int, int TPB = 256>
+void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream)
+{
+  detail::writeOnlyUnaryOpCaller(out, len, op, stream);
+}
+
+};  // end namespace linalg
+};  // end namespace raft
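The lambda shapes documented above are the key usage detail; a sketch of both entry points (device lambdas require nvcc's --extended-lambda; the rmm buffer setup is an assumption):

#include <raft/handle.hpp>
#include <raft/linalg/unary_op.hpp>
#include <rmm/device_uvector.hpp>

void unary_op_example(const raft::handle_t& handle)
{
  cudaStream_t stream = handle.get_stream();
  const int len = 1024;
  rmm::device_uvector<float> in(len, stream), out(len, stream);
  // unaryOp reads in[i] and writes out[i]: `OutType func(const InType& val)`
  raft::linalg::unaryOp(
    out.data(), in.data(), len, [] __device__(float x) { return x * x; }, stream);
  // writeOnlyUnaryOp never reads; op receives (out + idx, idx)
  raft::linalg::writeOnlyUnaryOp(
    out.data(), len, [] __device__(float* ptr, int idx) { *ptr = float(idx); }, stream);
}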
diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh
index 95103ab98e..6b32cbc06e 100644
--- a/cpp/include/raft/matrix/detail/math.cuh
+++ b/cpp/include/raft/matrix/detail/math.cuh
@@ -20,10 +20,10 @@
 #include
 #include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
 #include
 #include
diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh
index f9cfffe64d..6d631b4f4f 100644
--- a/cpp/include/raft/matrix/detail/matrix.cuh
+++ b/cpp/include/raft/matrix/detail/matrix.cuh
@@ -29,7 +29,7 @@
 #include
 #include
 #include
-#include
+#include
 
 namespace raft {
 namespace matrix {
@@ -278,7 +278,8 @@ m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t st
 {
   cublasHandle_t cublasH = handle.get_cublas_handle();
   m_t normval            = 0;
-  RAFT_CUBLAS_TRY(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream));
+  // #TODO: Call from the public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(cublasH, size, in, 1, &normval, stream));
   return normval;
 }
diff --git a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
index 07bf251f14..f65f524f62 100644
--- a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
@@ -20,7 +20,6 @@
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
index 00054a8e96..b1e9ae5a55 100644
--- a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
index 7f63a7fec8..57411f6998 100644
--- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
@@ -20,8 +20,8 @@
 #include
 #include
-#include
-#include
+#include
+#include
 #include
 #include
 #include
diff --git a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
index 1e907c98eb..4ceb31a3c8 100644
--- a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
@@ -20,7 +20,6 @@
 #include
 #include
-#include
 #include
 #include
diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp
index 2f121dce33..7ec032d186 100644
--- a/cpp/include/raft/sparse/distance/distance.hpp
+++ b/cpp/include/raft/sparse/distance/distance.hpp
@@ -20,7 +20,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
index 5d4640f4a6..bd96ca8649 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
@@ -20,10 +20,10 @@
 #include
 #include
-#include
+#include
 #include
-#include
+#include
#include #include #include diff --git a/cpp/include/raft/sparse/op/detail/slice.h b/cpp/include/raft/sparse/op/detail/slice.h index 3c47d19a0b..0f4f50ceb6 100644 --- a/cpp/include/raft/sparse/op/detail/slice.h +++ b/cpp/include/raft/sparse/op/detail/slice.h @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/connect_components.cuh b/cpp/include/raft/sparse/selection/detail/connect_components.cuh index 817b9782f2..b56b2df02e 100644 --- a/cpp/include/raft/sparse/selection/detail/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/detail/connect_components.cuh @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/knn.cuh b/cpp/include/raft/sparse/selection/detail/knn.cuh index de0a15c029..3de10a2782 100644 --- a/cpp/include/raft/sparse/selection/detail/knn.cuh +++ b/cpp/include/raft/sparse/selection/detail/knn.cuh @@ -20,8 +20,8 @@ #include #include -#include -#include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh b/cpp/include/raft/sparse/selection/detail/knn_graph.cuh index 1191251039..6ac96e1324 100644 --- a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/detail/knn_graph.cuh @@ -24,7 +24,7 @@ #include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/selection/knn.hpp b/cpp/include/raft/sparse/selection/knn.hpp index ddfb74dedc..8b2747d104 100644 --- a/cpp/include/raft/sparse/selection/knn.hpp +++ b/cpp/include/raft/sparse/selection/knn.hpp @@ -16,8 +16,8 @@ #pragma once +#include #include -#include #include namespace raft { diff --git a/cpp/include/raft/sparse/selection/knn_graph.hpp b/cpp/include/raft/sparse/selection/knn_graph.hpp index 6d7cb826da..825761d44d 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.hpp +++ b/cpp/include/raft/sparse/selection/knn_graph.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h index 79f75dc8ae..339ca3687a 100644 --- a/cpp/include/raft/spatial/knn/ann_common.h +++ b/cpp/include/raft/spatial/knn/ann_common.h @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/ball_cover.hpp b/cpp/include/raft/spatial/knn/ball_cover.hpp index a4bdb8f2de..5b93439218 100644 --- a/cpp/include/raft/spatial/knn/ball_cover.hpp +++ b/cpp/include/raft/spatial/knn/ball_cover.hpp @@ -21,7 +21,7 @@ #include "ball_cover_common.h" #include "detail/ball_cover.cuh" #include "detail/ball_cover/common.cuh" -#include +#include #include namespace raft { diff --git a/cpp/include/raft/spatial/knn/ball_cover_common.h b/cpp/include/raft/spatial/knn/ball_cover_common.h index 2830b81cc0..e1a202107b 100644 --- a/cpp/include/raft/spatial/knn/ball_cover_common.h +++ b/cpp/include/raft/spatial/knn/ball_cover_common.h @@ -17,8 +17,8 @@ #pragma once #include +#include #include -#include #include namespace raft { diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 1c2f21b72c..153b6b1d8a 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -43,7 +43,7 @@ #include -#include +#include #include diff --git 
a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h index 328ec0bf81..587505316b 100644 --- a/cpp/include/raft/spatial/knn/detail/common_faiss.h +++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h @@ -20,7 +20,7 @@ #include #include -#include +#include namespace raft { namespace spatial { diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh index 385e16383e..6b5df01a97 100644 --- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh +++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh @@ -17,7 +17,7 @@ #include #include #include -#include +#include // TODO: Need to hide the PairwiseDistance class impl and expose to public API #include "processing.hpp" #include diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh index 8d40860535..06473f6151 100644 --- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh +++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh @@ -25,8 +25,8 @@ #include #include +#include #include -#include #include namespace raft { diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 98a8885369..d5dfe4f8f8 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -30,8 +30,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index f87fffc6cf..a515ca8507 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -15,10 +15,10 @@ */ #pragma once -#include -#include -#include -#include +#include +#include +#include +#include #include #include #include diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index bcb28bfb1e..56f4022a8c 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include @@ -657,20 +657,21 @@ static int updateCentroids(handle_t const& handle, thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix - RAFT_CUBLAS_TRY(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - d, - &one, - obs, - d, - &zero, - (value_type_t*)NULL, - n, - thrust::raw_pointer_cast(obs_copy), - n, - stream)); + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + d, + &one, + obs, + d, + &zero, + (value_type_t*)NULL, + n, + thrust::raw_pointer_cast(obs_copy), + n, + stream)); // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); @@ -852,7 +853,9 @@ int kmeans(handle_t const& handle, } // Initialize cuBLAS - RAFT_CUBLAS_TRY(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // k-means++ algorithm diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index 891c367d5e..d066c68a68 100644 --- 
a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -18,8 +18,8 @@ #include #include -#include -#include +#include +#include // for now; TODO: check if/where this `define` should be; // diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index a260e75505..75f0121795 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include @@ -349,7 +349,8 @@ struct laplacian_matrix_t : sparse_matrix_t { if (beta == 0) { CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); } else if (beta != 1) { - RAFT_CUBLAS_TRY(linalg::cublasscal(cublas_h, n, &beta, y, 1, stream)); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal(cublas_h, n, &beta, y, 1, stream)); } // Apply diagonal matrix @@ -412,7 +413,9 @@ struct modularity_matrix_t : laplacian_matrix_t { // gamma = d'*x // // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - RAFT_CUBLAS_TRY(linalg::cublasdot(cublas_h, + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(cublas_h, n, laplacian_matrix_t::diagonal_.raw(), 1, @@ -424,7 +427,9 @@ struct modularity_matrix_t : laplacian_matrix_t { // y = y -(gamma/edge_sum)*d // value_type gamma_ = -dot_res / edge_sum_; - RAFT_CUBLAS_TRY(linalg::cublasaxpy(cublas_h, + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasaxpy(cublas_h, n, &gamma_, laplacian_matrix_t::diagonal_.raw(), diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index c61b5f1458..8188a772b8 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -160,7 +160,9 @@ void analyzeModularity(handle_t const& handle, vector_t Bx(handle, n); // Initialize cuBLAS - RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + // #TODO: Use public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity modularity_matrix_t B{handle, csr_m}; diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index a30906de10..6b57566a73 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -132,7 +133,9 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, thrust::minus()); RAFT_CHECK_CUDA(stream); - RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); std /= std::sqrt(static_cast(n)); @@ -149,22 +152,25 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, // TODO: in-place transpose { vector_t work(handle, nEigVecs * n); - RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - RAFT_CUBLAS_TRY(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - nEigVecs, - n, - &one, - eigVecs, - n, - &zero, - (weight_t*)NULL, - nEigVecs, - work.raw(), - nEigVecs, - stream)); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + 
raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t*)NULL, + nEigVecs, + work.raw(), + nEigVecs, + stream)); RAFT_CUDA_TRY(cudaMemcpyAsync( eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream)); @@ -216,14 +222,18 @@ bool construct_indicator(handle_t const& handle, RAFT_CHECK_CUDA(stream); // Compute size of ith partition - RAFT_CUBLAS_TRY(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot( + cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); clustersize = round(clustersize); if (clustersize < 0.5) { return false; } // Compute part stats B.mv(1, part_i.raw(), 0, Bx.raw()); - RAFT_CUBLAS_TRY(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); return true; } diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh index 899e378d38..a512579c11 100644 --- a/cpp/include/raft/stats/detail/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include diff --git a/cpp/include/raft/stats/detail/mean_center.cuh b/cpp/include/raft/stats/detail/mean_center.cuh index 1a4fc20c51..db2eaf8459 100644 --- a/cpp/include/raft/stats/detail/mean_center.cuh +++ b/cpp/include/raft/stats/detail/mean_center.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/stats/detail/meanvar.cuh b/cpp/include/raft/stats/detail/meanvar.cuh index ed411ef74d..e3f586fea8 100644 --- a/cpp/include/raft/stats/detail/meanvar.cuh +++ b/cpp/include/raft/stats/detail/meanvar.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include namespace raft::stats::detail { diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh index 229eb34a7d..c07c212e54 100644 --- a/cpp/include/raft/stats/detail/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include diff --git a/cpp/include/raft/stats/detail/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh index 1db504965c..ad46c3bf10 100644 --- a/cpp/include/raft/stats/detail/sum.cuh +++ b/cpp/include/raft/stats/detail/sum.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 27908d8f15..176922529f 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu index b54e69df36..c277db76ee 100644 --- a/cpp/test/linalg/add.cu +++ b/cpp/test/linalg/add.cu @@ -18,7 +18,7 @@ #include "add.cuh" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/add.cuh b/cpp/test/linalg/add.cuh index 70e4866407..1f1ff87a4d 100644 --- a/cpp/test/linalg/add.cuh +++ b/cpp/test/linalg/add.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include 
namespace raft { namespace linalg { diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index 2cf0679849..55810a5ca0 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -18,7 +18,7 @@ #include "binary_op.cuh" #include #include -#include +#include #include #include diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/test/linalg/binary_op.cuh index c2ba8c18ee..b9ca9f8fd2 100644 --- a/cpp/test/linalg/binary_op.cuh +++ b/cpp/test/linalg/binary_op.cuh @@ -18,7 +18,7 @@ #include "../test_utils.h" #include -#include +#include namespace raft { namespace linalg { diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 9f07341c33..9f44cc8d5f 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -17,8 +17,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -42,7 +42,8 @@ class CholeskyR1Test : public ::testing::Test { // Allocate workspace solver_handle = handle.get_cusolver_dn_handle(); - RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnpotrf_bufferSize( + // TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf_bufferSize( solver_handle, CUBLAS_FILL_MODE_LOWER, n_rows, L.data(), n_rows, &Lwork)); int n_bytes = 0; // Initializing in CUBLAS_FILL_MODE_LOWER, because that has larger workspace @@ -72,15 +73,16 @@ class CholeskyR1Test : public ::testing::Test { // Expected solution using Cholesky factorization from scratch raft::copy(L_exp.data(), G.data(), n, handle.get_stream()); - RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnpotrf(solver_handle, - uplo, - rank, - L_exp.data(), - n_rows, - (math_t*)workspace.data(), - Lwork, - devInfo.data(), - handle.get_stream())); + // TODO: Call from public API when ready + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf(solver_handle, + uplo, + rank, + L_exp.data(), + n_rows, + (math_t*)workspace.data(), + Lwork, + devInfo.data(), + handle.get_stream())); // Incremental Cholesky factorization using rank one updates. 
raft::linalg::choleskyRank1Update( diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index f7bd0b60a4..1d1a4fd864 100644 --- a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu index 51173c07f0..f07a7d05ce 100644 --- a/cpp/test/linalg/divide.cu +++ b/cpp/test/linalg/divide.cu @@ -18,7 +18,7 @@ #include "unary_op.cuh" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 83bca6198d..9100a3a5f6 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index b46efc38fd..4ae2653e47 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include namespace raft { @@ -83,15 +83,15 @@ class EigSelTest : public ::testing::TestWithParam> { raft::update_device(eig_vectors_ref.data(), eig_vectors_ref_h, 12, stream); raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, 4, stream); - eigSelDC(handle, - cov_matrix.data(), - params.n_row, - params.n_col, - 3, - eig_vectors.data(), - eig_vals.data(), - EigVecMemUsage::OVERWRITE_INPUT, - stream); + raft::linalg::eigSelDC(handle, + cov_matrix.data(), + params.n_row, + params.n_col, + 3, + eig_vectors.data(), + eig_vals.data(), + EigVecMemUsage::OVERWRITE_INPUT, + stream); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu index 56e091837a..146d48e179 100644 --- a/cpp/test/linalg/eltwise.cu +++ b/cpp/test/linalg/eltwise.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu index d539fe9a69..72567ff5f9 100644 --- a/cpp/test/linalg/gemm_layout.cu +++ b/cpp/test/linalg/gemm_layout.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/gemv.cu b/cpp/test/linalg/gemv.cu index f32d4cf809..ea84e06675 100644 --- a/cpp/test/linalg/gemv.cu +++ b/cpp/test/linalg/gemv.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 74b812a63b..d27fad4dfc 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -17,8 +17,8 @@ #include "../test_utils.h" #include #include -#include -#include +#include +#include #include namespace raft { diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu index 5b60b69d36..9875e2548f 100644 --- a/cpp/test/linalg/map_then_reduce.cu +++ b/cpp/test/linalg/map_then_reduce.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh index 531bf370d4..9ab005a075 100644 --- a/cpp/test/linalg/matrix_vector_op.cuh +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -16,7 +16,7 @@ #include "../test_utils.h" #include -#include +#include namespace raft { namespace linalg { diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu 
index 7bf797a4dc..ec0599eb1b 100644 --- a/cpp/test/linalg/multiply.cu +++ b/cpp/test/linalg/multiply.cu @@ -18,7 +18,7 @@ #include "unary_op.cuh" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index 6415d4dca9..56e111d056 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index fa05397555..14f34f142d 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 5fb7ddbc7b..7840df2c0d 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -17,11 +17,9 @@ #pragma once #include - #include -#include -#include - +#include +#include #include #include @@ -69,7 +67,8 @@ void unaryAndGemv(OutType* dots, const InType* data, int D, int N, cudaStream_t raft::linalg::unaryOp( ones.data(), ones.data(), ones.size(), [=] __device__(OutType input) { return 1; }, stream); OutType alpha = 1, beta = 0; - RAFT_CUBLAS_TRY(raft::linalg::cublasgemv( + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv( handle, CUBLAS_OP_N, D, N, &alpha, sq.data(), D, ones.data(), 1, &beta, dots, 1, stream)); RAFT_CUDA_TRY(cudaDeviceSynchronize()); RAFT_CUBLAS_TRY(cublasDestroy(handle)); diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index f2aa02d9d9..6d33fbdef1 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -18,7 +18,7 @@ #include "reduce.cuh" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu index 9c62eeb9f1..7b82dab1ad 100644 --- a/cpp/test/linalg/subtract.cu +++ b/cpp/test/linalg/subtract.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index 47895cbc6a..e9128bad93 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index 6aa83fc074..60db1ee82b 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index 1b132955f5..050fed78ea 100644 --- a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -18,7 +18,7 @@ #include "unary_op.cuh" #include #include -#include +#include #include namespace raft { diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/test/linalg/unary_op.cuh index 8bb2d1e0be..b47e60d4f6 100644 --- a/cpp/test/linalg/unary_op.cuh +++ b/cpp/test/linalg/unary_op.cuh @@ -18,7 +18,7 @@ #include "../test_utils.h" #include -#include +#include namespace raft { namespace linalg { diff --git a/cpp/test/matrix/linewise_op.cu b/cpp/test/matrix/linewise_op.cu index 1cd00b8adc..cd0d065ad4 100644 --- a/cpp/test/matrix/linewise_op.cu +++ b/cpp/test/matrix/linewise_op.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git 
a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index c9b15737a1..648964fc57 100644 --- a/cpp/test/sparse/connect_components.cu +++ b/cpp/test/sparse/connect_components.cu @@ -26,8 +26,8 @@ #include #include -#include -#include +#include +#include #include #include #include diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index dc136d6f18..d3f4adb01b 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -19,8 +19,8 @@ #include #include -#include -#include +#include +#include #include #include diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index f4f346561c..c8798f832f 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index bcfa796931..f0336a31fa 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -18,7 +18,7 @@ #include #include "../test_utils.h" -#include +#include #include #include diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index 81e6dc4768..cb09b9e7f5 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -17,8 +17,8 @@ #include "../test_utils.h" #include -#include -#include +#include +#include #include #include diff --git a/cpp/test/spatial/ball_cover.cu b/cpp/test/spatial/ball_cover.cu index 257950e4d7..66cd11be1f 100644 --- a/cpp/test/spatial/ball_cover.cu +++ b/cpp/test/spatial/ball_cover.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include "spatial_data.h" #include -#include +#include #include #include #if defined RAFT_NN_COMPILED diff --git a/cpp/test/spatial/faiss_mr.cu b/cpp/test/spatial/faiss_mr.cu index a626824621..e635619897 100644 --- a/cpp/test/spatial/faiss_mr.cu +++ b/cpp/test/spatial/faiss_mr.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include -#include +#include #include #include diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu index d30c018ed2..3254d41401 100644 --- a/cpp/test/spatial/fused_l2_knn.cu +++ b/cpp/test/spatial/fused_l2_knn.cu @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index f60ec54bbc..6b7402a7bd 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include #include -#include +#include #include #include #include diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 8af1505bcd..ee216ee434 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -16,7 +16,7 @@ #include "../test_utils.h" -#include +#include #include #if defined RAFT_NN_COMPILED diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index 55e72656da..fd656423ad 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -17,7 +17,7 @@ #include "../test_utils.h" #include #include -#include +#include #include #include
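The remaining hunks are all of the same shape: tests pick up consolidated raft/<area>/<name>.hpp public headers in place of the old .cuh paths (the exact old targets are elided above). A sketch of what one migrated call site might look like, assuming raft/stats/sum.hpp exposes the usual sum(output, input, D, N, rowMajor, stream) wrapper; the include paths and buffer setup here are assumptions, not taken from the diff:

#include <raft/handle.hpp>
#include <raft/stats/sum.hpp>  // previously a .cuh include (assumed)
#include <rmm/device_uvector.hpp>

void sum_example(const raft::handle_t& handle)
{
  cudaStream_t stream = handle.get_stream();
  const int D = 4, N = 128;  // D columns, N rows (illustrative)
  rmm::device_uvector<float> data(D * N, stream);
  rmm::device_uvector<float> col_sums(D, stream);
  // column-wise sums of a column major N x D matrix (rowMajor = false);
  // the public wrapper forwards to raft::stats::detail, as elsewhere in this PR
  raft::stats::sum(col_sums.data(), data.data(), D, N, false, stream);
}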