diff --git a/CHANGELOG.md b/CHANGELOG.md
index 58477e868f..0e376f5f76 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
 # RAFT 0.15.0 (Date TBD)
 
 ## New Features
+- PR #12: Spectral clustering.
 - PR #7: Migrating cuml comms -> raft comms_t
 - PR #15: add exception based error handling macros
 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7bff25da45..b1a2284285 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -204,6 +204,7 @@ set(RAFT_LINK_LIBRARIES
   ${CUDA_cusolver_LIBRARY}
   ${CUDA_CUDART_LIBRARY}
   ${CUDA_cusparse_LIBRARY}
+  ${CUDA_curand_LIBRARY}
   rmm)
 
 set(RAFT_LINK_DIRECTORIES "${RMM_LIBRARY}")
@@ -224,7 +225,10 @@ if(BUILD_RAFT_TESTS)
     test/handle.cpp
     test/mr/device/buffer.cpp
     test/mr/host/buffer.cpp
-    test/test.cpp)
+    test/test.cpp
+    test/spectral_matrix.cu
+    test/eigen_solvers.cu
+    test/cluster_solvers.cu)
 
   target_include_directories(test_raft
     PRIVATE
diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h
index 7e8a52196a..40caff228e 100644
--- a/cpp/include/raft/linalg/cublas_wrappers.h
+++ b/cpp/include/raft/linalg/cublas_wrappers.h
@@ -578,5 +578,51 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x,
 }
 /** @} */
 
+/**
+ * @defgroup setpointermode cublas set pointer mode method
+ * @{
+ */
+// no T dependency...
+// template <typename T>
+// cublasStatus_t cublassetpointermode(  // NOLINT
+//   cublasHandle_t handle,
+//   cublasPointerMode_t mode,
+//   cudaStream_t stream);
+
+// template<>
+inline cublasStatus_t cublassetpointermode(cublasHandle_t handle,
+                                           cublasPointerMode_t mode,
+                                           cudaStream_t stream) {
+  CUBLAS_CHECK(cublasSetStream(handle, stream));
+  return cublasSetPointerMode(handle, mode);
+}
+/** @} */
+
+/**
+ * @defgroup scal cublas scal calls
+ * @{
+ */
+template <typename T>
+cublasStatus_t cublasscal(cublasHandle_t handle, int n, const T *alpha, T *x,
+                          int incx, cudaStream_t stream);
+
+template <>
+inline cublasStatus_t cublasscal(cublasHandle_t handle, int n,
+                                 const float *alpha, float *x, int incx,
+                                 cudaStream_t stream) {
+  CUBLAS_CHECK(cublasSetStream(handle, stream));
+  return cublasSscal(handle, n, alpha, x, incx);
+}
+
+template <>
+inline cublasStatus_t cublasscal(cublasHandle_t handle, int n,
+                                 const double *alpha, double *x, int incx,
+                                 cudaStream_t stream) {
+  CUBLAS_CHECK(cublasSetStream(handle, stream));
+  return cublasDscal(handle, n, alpha, x, incx);
+}
+
+/** @} */
+
 }  // namespace linalg
 }  // namespace raft
diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp
new file mode 100644
index 0000000000..b775a1f696
--- /dev/null
+++ b/cpp/include/raft/linalg/lanczos.hpp
@@ -0,0 +1,1231 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +//for cmath: +#define _USE_MATH_DEFINES + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace raft { + +using namespace matrix; +using namespace linalg; + +namespace spectral { + +// curandGeneratorNormalX +inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, + float *outputPtr, size_t n, + float mean, float stddev) { + return curandGenerateNormal(generator, outputPtr, n, mean, stddev); +} +inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, + double *outputPtr, size_t n, + double mean, double stddev) { + return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); +} + +// ========================================================= +// Helper functions +// ========================================================= + +/** + * @brief Perform Lanczos iteration + * Lanczos iteration is performed on a shifted matrix A+shift*I. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param iter Pointer to current Lanczos iteration. On exit, the + * variable is set equal to the final Lanczos iteration. + * @param maxIter Maximum Lanczos iteration. This function will + * perform a maximum of maxIter-*iter iterations. + * @param shift Matrix shift. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm (i.e. entry in beta_host) is + * less than tol. + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param alpha_host (Output, host memory, maxIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, maxIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Input/output, device memory, + * n*(maxIter+1) entries) Lanczos vectors. Vectors are stored as + * columns of a column-major matrix with dimensions + * n x (maxIter+1). + * @param work_dev (Output, device memory, maxIter entries) + * Workspace. Not needed if full reorthogonalization is disabled. + * @return Zero if successful. Otherwise non-zero. 
+ */ +template +int performLanczosIteration( + handle_t const &handle, sparse_matrix_t const *A, + index_type_t *iter, index_type_t maxIter, value_type_t shift, + value_type_t tol, bool reorthogonalize, value_type_t *__restrict__ alpha_host, + value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev) { + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful variables + constexpr value_type_t one = 1; + constexpr value_type_t negOne = -1; + constexpr value_type_t zero = 0; + value_type_t alpha; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); + + index_type_t n = A->nrows_; + + // ------------------------------------------------------- + // Compute second Lanczos vector + // ------------------------------------------------------- + if (*iter <= 0) { + *iter = 1; + + // Apply matrix + if (shift != 0) + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, stream)); + A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); + + // Orthogonalize Lanczos vector + CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1, + lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, + stream)); + + alpha = -alpha_host[0]; + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, + lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, + beta_host, stream)); + + // Check if Lanczos has converged + if (beta_host[0] <= tol) return 0; + + // Normalize Lanczos vector + alpha = 1 / beta_host[0]; + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), + 1, stream)); + } + + // ------------------------------------------------------- + // Compute remaining Lanczos vectors + // ------------------------------------------------------- + + while (*iter < maxIter) { + ++(*iter); + + // Apply matrix + if (shift != 0) + CUDA_TRY(cudaMemcpyAsync( + lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, + lanczosVecs_dev + IDX(0, *iter, n)); + + // Full reorthogonalization + // "Twice is enough" algorithm per Kahan and Parlett + if (reorthogonalize) { + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, + lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, + lanczosVecs_dev, n, work_dev, 1, &one, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + + CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), + sizeof(value_type_t), cudaMemcpyDeviceToHost, + stream)); + + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, + lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, + lanczosVecs_dev, n, work_dev, 1, &one, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + } + + // Orthogonalization with 3-term recurrence relation + else { + CUBLAS_CHECK(cublasdot(cublas_h, n, + lanczosVecs_dev + IDX(0, *iter - 1, n), 1, + lanczosVecs_dev + IDX(0, *iter, n), 1, + alpha_host + (*iter - 1), stream)); + + auto alpha = -alpha_host[*iter - 
1]; + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), 1, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + + alpha = -beta_host[*iter - 2]; + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), 1, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + } + + // Compute residual + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, + beta_host + *iter - 1, stream)); + + // Check if Lanczos has converged + if (beta_host[*iter - 1] <= tol) break; + + // Normalize Lanczos vector + alpha = 1 / beta_host[*iter - 1]; + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + } + + CUDA_TRY(cudaStreamSynchronize(stream)); + + return 0; +} + +/** + * @brief Find Householder transform for 3-dimensional system + * Given an input vector v=[x,y,z]', this function finds a + * Householder transform P such that P*v is a multiple of + * e_1=[1,0,0]'. The input vector v is overwritten with the + * Householder vector such that P=I-2*v*v'. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param v (Input/output, host memory, 3 entries) Input + * 3-dimensional vector. On exit, the vector is set to the + * Householder vector. + * @param Pv (Output, host memory, 1 entry) First entry of P*v + * (here v is the input vector). Either equal to ||v||_2 or + * -||v||_2. + * @param P (Output, host memory, 9 entries) Householder transform + * matrix. Matrix dimensions are 3 x 3. + */ +template +static void findHouseholder3(value_type_t *v, value_type_t *Pv, + value_type_t *P) { + // Compute norm of vector + *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + + // Choose whether to reflect to e_1 or -e_1 + // This choice avoids catastrophic cancellation + if (v[0] >= 0) *Pv = -(*Pv); + v[0] -= *Pv; + + // Normalize Householder vector + value_type_t normHouseholder = + std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + if (normHouseholder != 0) { + v[0] /= normHouseholder; + v[1] /= normHouseholder; + v[2] /= normHouseholder; + } else { + v[0] = 0; + v[1] = 0; + v[2] = 0; + } + + // Construct Householder matrix + index_type_t i, j; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j]; + for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; +} + +/** + * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix + * The Householder transform is pre-applied to the top three rows + * of the matrix and post-applied to the left three columns. The + * 4 x 4 matrix is intended to contain the bulge that is produced + * in the Francis QR algorithm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param v (Input, host memory, 3 entries) Householder vector. + * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. 
+ */ +template +static void applyHouseholder3(const value_type_t *v, value_type_t *A) { + // Loop indices + index_type_t i, j; + // Dot product between Householder vector and matrix row/column + value_type_t vDotA; + + // Pre-apply Householder transform + for (j = 0; j < 4; ++j) { + vDotA = 0; + for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)]; + for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; + } + + // Post-apply Householder transform + for (i = 0; i < 4; ++i) { + vDotA = 0; + for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j]; + for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; + } +} + +/** + * @brief Perform one step of Francis QR algorithm + * Equivalent to two steps of the classical QR algorithm on a + * tridiagonal matrix. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Matrix dimension. + * @param shift1 QR algorithm shift. + * @param shift2 QR algorithm shift. + * @param alpha (Input/output, host memory, n entries) Diagonal + * entries of tridiagonal matrix. + * @param beta (Input/output, host memory, n-1 entries) + * Off-diagonal entries of tridiagonal matrix. + * @param V (Input/output, host memory, n*n entries) Orthonormal + * transforms from previous steps of QR algorithm. Matrix + * dimensions are n x n. On exit, the orthonormal transform from + * this Francis QR step is post-applied to the matrix. + * @param work (Output, host memory, 3*n entries) Workspace. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int francisQRIteration(index_type_t n, value_type_t shift1, + value_type_t shift2, value_type_t *alpha, + value_type_t *beta, value_type_t *V, + value_type_t *work) { + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Temporary storage of 4x4 bulge and Householder vector + value_type_t bulge[16]; + + // Householder vector + value_type_t householder[3]; + // Householder matrix + value_type_t householderMatrix[3 * 3]; + + // Shifts are roots of the polynomial p(x)=x^2+b*x+c + value_type_t b = -shift1 - shift2; + value_type_t c = shift1 * shift2; + + // Loop indices + index_type_t i, j, pos; + // Temporary variable + value_type_t temp; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute initial Householder transform + householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; + householder[1] = beta[0] * (alpha[0] + alpha[1] + b); + householder[2] = beta[0] * beta[1]; + findHouseholder3(householder, &temp, + householderMatrix); + + // Apply initial Householder transform to create bulge + memset(bulge, 0, 16 * sizeof(value_type_t)); + for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i]; + for (i = 0; i < 3; ++i) { + bulge[IDX(i + 1, i, 4)] = beta[i]; + bulge[IDX(i, i + 1, 4)] = beta[i]; + } + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, + 3, 0, work, n); + memcpy(V, work, 3 * n * sizeof(value_type_t)); + + // Chase bulge to bottom-right of matrix with Householder transforms + for (pos = 0; pos < n - 4; ++pos) { + // Move to next position + alpha[pos] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = bulge[IDX(3, 0, 4)]; + for (j = 0; j < 3; ++j) + for (i = 0; i < 
3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + bulge[IDX(3, 0, 4)] = 0; + bulge[IDX(3, 1, 4)] = 0; + bulge[IDX(3, 2, 4)] = beta[pos + 3]; + bulge[IDX(0, 3, 4)] = 0; + bulge[IDX(1, 3, 4)] = 0; + bulge[IDX(2, 3, 4)] = beta[pos + 3]; + bulge[IDX(3, 3, 4)] = alpha[pos + 4]; + + // Apply Householder transform + findHouseholder3(householder, beta + pos, + householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), + n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); + } + + // Apply penultimate Householder transform + // Values in the last row and column are zero + alpha[n - 4] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = bulge[IDX(3, 0, 4)]; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + bulge[IDX(3, 0, 4)] = 0; + bulge[IDX(3, 1, 4)] = 0; + bulge[IDX(3, 2, 4)] = 0; + bulge[IDX(0, 3, 4)] = 0; + bulge[IDX(1, 3, 4)] = 0; + bulge[IDX(2, 3, 4)] = 0; + bulge[IDX(3, 3, 4)] = 0; + findHouseholder3(householder, beta + n - 4, + householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, + householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); + + // Apply final Householder transform + // Values in the last two rows and columns are zero + alpha[n - 3] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = 0; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + findHouseholder3(householder, beta + n - 3, + householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, + householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); + + // Bulge has been eliminated + alpha[n - 2] = bulge[IDX(0, 0, 4)]; + alpha[n - 1] = bulge[IDX(1, 1, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; + + return 0; +} + +/** + * @brief Perform implicit restart of Lanczos algorithm + * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param n Matrix dimension. + * @param iter Current Lanczos iteration. + * @param iter_new Lanczos iteration after restart. + * @param shiftUpper Pointer (host memory) to upper bound for unwanted + * region. Value is ignored if less than *shiftLower. If a + * stronger upper bound has been found, the value is updated on + * exit. + * @param shiftLower Pointer (host memory) to lower bound for unwanted + * region. Value is ignored if greater than *shiftUpper. If a + * stronger lower bound has been found, the value is updated on + * exit. + * @param alpha_host (Input/output, host memory, iter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Input/output, host memory, iter entries) + * Off-diagonal entries of Lanczos system. + * @param V_host (Output, host memory, iter*iter entries) + * Orthonormal transform used to obtain restarted system. Matrix + * dimensions are iter x iter. + * @param work_host (Output, host memory, 4*iter entries) + * Workspace. 
+ * @param lanczosVecs_dev (Input/output, device memory, n*(iter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (iter+1). + * @param work_dev (Output, device memory, (n+iter)*iter entries) + * Workspace. + * @param smallest_eig specifies whether smallest (true) or largest + * (false) eigenvalues are to be calculated. + * @return error flag. + */ +template +static int lanczosRestart( + handle_t const &handle, index_type_t n, index_type_t iter, + index_type_t iter_new, value_type_t *shiftUpper, value_type_t *shiftLower, + value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ V_host, value_type_t *__restrict__ work_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, bool smallest_eig) { + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful constants + constexpr value_type_t zero = 0; + constexpr value_type_t one = 1; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Loop index + index_type_t i; + + // Number of implicit restart steps + // Assumed to be even since each call to Francis algorithm is + // equivalent to two calls of QR algorithm + index_type_t restartSteps = iter - iter_new; + + // Ritz values from Lanczos method + value_type_t *ritzVals_host = work_host + 3 * iter; + // Shifts for implicit restart + value_type_t *shifts_host; + + // Orthonormal matrix for similarity transform + value_type_t *V_dev = work_dev + n * iter; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute Ritz values + memcpy(ritzVals_host, alpha_host, iter * sizeof(value_type_t)); + memcpy(work_host, beta_host, (iter - 1) * sizeof(value_type_t)); + Lapack::sterf(iter, ritzVals_host, work_host); + + // Debug: Print largest eigenvalues + // for (int i = iter-iter_new; i < iter; ++i) + // std::cout <<*(ritzVals_host+i)<< " "; + // std::cout < *shiftUpper) { + *shiftUpper = ritzVals_host[iter - 1]; + *shiftLower = ritzVals_host[iter_new]; + } else { + *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - 1]); + *shiftLower = std::min(*shiftLower, ritzVals_host[iter_new]); + } + } else { + if (*shiftLower > *shiftUpper) { + *shiftUpper = ritzVals_host[iter - iter_new - 1]; + *shiftLower = ritzVals_host[0]; + } else { + *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - iter_new - 1]); + *shiftLower = std::min(*shiftLower, ritzVals_host[0]); + } + } + + // Calculate Chebyshev nodes as shifts + shifts_host = ritzVals_host; + for (i = 0; i < restartSteps; ++i) { + shifts_host[i] = + cos((i + 0.5) * static_cast(M_PI) / restartSteps); + shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); + shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); + } + + // Apply Francis QR algorithm to implicitly restart Lanczos + for (i = 0; i < restartSteps; i += 2) + if (francisQRIteration(iter, shifts_host[i], shifts_host[i + 1], alpha_host, + beta_host, V_host, work_host)) + WARNING("error in implicitly shifted QR algorithm"); + + // Obtain new residual + CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(value_type_t), + cudaMemcpyHostToDevice, stream)); + + beta_host[iter - 1] = + beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_N, n, iter, 
beta_host + iter_new - 1, lanczosVecs_dev, + n, V_dev + IDX(0, iter_new, iter), 1, beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), 1, stream)); + + // Obtain new Lanczos vectors + CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, iter_new, iter, + &one, lanczosVecs_dev, n, V_dev, iter, &zero, + work_dev, n, stream)); + + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, + n * iter_new * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, stream)); + + // Normalize residual to obtain new Lanczos vector + CUDA_TRY(cudaMemcpyAsync( + lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), + n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, + beta_host + iter_new - 1, stream)); + + auto h_beta = 1 / beta_host[iter_new - 1]; + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, + lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); + + return 0; +} + +} // namespace spectral + +// ========================================================= +// Eigensolver +// ========================================================= + +/** + * @brief Compute smallest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the smallest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th smallest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param shift On exit, pointer to matrix shift (estimate for + * largest eigenvalue). + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to smallest eigenvalues of + * matrix. 
Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. + */ +template +int computeSmallestEigenvectors( + handle_t const &handle, sparse_matrix_t const *A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t *effIter, + index_type_t *totalIter, value_type_t *shift, + value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { + using namespace spectral; + + // Useful constants + constexpr value_type_t one = 1; + constexpr value_type_t zero = 0; + + // Matrix dimension + index_type_t n = A->nrows_; + + // Shift for implicit restart + value_type_t shiftUpper; + value_type_t shiftLower; + + // Lanczos iteration counters + index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system + + // Status flags + int status; + + // Loop index + index_type_t i; + + // Host memory + value_type_t *Z_host; // Eigenvectors in Lanczos basis + value_type_t *work_host; // Workspace + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // ------------------------------------------------------- + // Variable initialization + // ------------------------------------------------------- + + // Total number of Lanczos iterations + *totalIter = 0; + + // Allocate host memory + std::vector Z_host_v(restartIter * restartIter); + std::vector work_host_v(4 * restartIter); + + Z_host = Z_host_v.data(); + work_host = work_host_v.data(); + + // Initialize cuBLAS + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // Compute largest eigenvalue to determine shift + // ------------------------------------------------------- + + // Random number generator + curandGenerator_t randGen; + // Initialize random number generator + curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); + + curandSetPseudoRandomGeneratorSeed(randGen, seed); + + // Initialize initial Lanczos vector + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); + value_type_t normQ1; + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + + auto h_val = 1 / normQ1; + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + *shift = 0; + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); + if (status) WARNING("error in Lanczos iteration"); + + // Determine largest eigenvalue + + Lapack::sterf(*effIter, alpha_host, beta_host); + *shift = -alpha_host[*effIter - 1]; + + // ------------------------------------------------------- + // Compute eigenvectors of 
shifted matrix + // ------------------------------------------------------- + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter; + + // Apply Lanczos method until convergence + shiftLower = 1; + shiftUpper = -1; + while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { + // Determine number of restart steps + // Number of steps must be even due to Francis algorithm + index_type_t iter_new = nEigVecs + 1; + if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) + iter_new = restartIter - (maxIter - *totalIter); + if ((restartIter - iter_new) % 2) iter_new -= 1; + if (iter_new == *effIter) break; + + // Implicit restart of Lanczos method + status = lanczosRestart( + handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, + beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true); + if (status) WARNING("error in Lanczos implicit restart"); + *effIter = iter_new; + + // Check for convergence + if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; + + // Proceed with Lanczos method + + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), + reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter - iter_new; + } + + // Warning if Lanczos has failed to converge + if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) { + WARNING("implicitly restarted Lanczos failed to converge"); + } + + // Solve tridiagonal system + memcpy(work_host + 2 * (*effIter), alpha_host, + (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, + (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), + work_host + 3 * (*effIter), Z_host, *effIter, + work_host); + + // Obtain desired eigenvalues by applying shift + for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; + for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0; + + // Copy results to device memory + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, work_host + 2 * (*effIter), + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, stream)); + + CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host, + (*effIter) * nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(stream); + + // Convert eigenvectors from Lanczos basis to standard basis + CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, + *effIter, &one, lanczosVecs_dev, n, work_dev, + *effIter, &zero, eigVecs_dev, n, stream)); + + // Clean up and exit + curandDestroyGenerator(randGen); + return 0; +} + +/** + * @brief Compute smallest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. 
+ * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the smallest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th smallest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param iter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Smallest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to smallest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. + */ +template +int computeSmallestEigenvectors( + handle_t const &handle, sparse_matrix_t const &A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t &iter, + value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { + using namespace spectral; + + // Matrix dimension + index_type_t n = A.nrows_; + + // Check that parameters are valid + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + // Allocate memory + std::vector alpha_host_v(restartIter); + std::vector beta_host_v(restartIter); + + value_type_t *alpha_host = alpha_host_v.data(); + value_type_t *beta_host = beta_host_v.data(); + + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); + + // Perform Lanczos method + index_type_t effIter; + value_type_t shift; + int status = computeSmallestEigenvectors( + handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, + &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), + eigVals_dev, eigVecs_dev, seed); + + // Clean up and return + return status; +} + +// ========================================================= +// Eigensolver +// ========================================================= + +/** + * @brief Compute largest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. 
+ * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the largest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th largest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to largest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. + */ +template +int computeLargestEigenvectors( + handle_t const &handle, sparse_matrix_t const *A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t *effIter, + index_type_t *totalIter, value_type_t *__restrict__ alpha_host, + value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { + using namespace spectral; + + // Useful constants + constexpr value_type_t one = 1; + constexpr value_type_t zero = 0; + + // Matrix dimension + index_type_t n = A->nrows_; + + // Lanczos iteration counters + index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system + + // Status flags + int status; + + // Loop index + index_type_t i; + + // Host memory + value_type_t *Z_host; // Eigenvectors in Lanczos basis + value_type_t *work_host; // Workspace + + // ------------------------------------------------------- + // Check that LAPACK is enabled + // ------------------------------------------------------- + // Lapack::check_lapack_enabled(); + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // ------------------------------------------------------- + // Variable initialization + // ------------------------------------------------------- + + // Total number of Lanczos iterations + *totalIter = 0; + + // Allocate host memory + std::vector Z_host_v(restartIter * restartIter); + std::vector 
work_host_v(4 * restartIter); + + Z_host = Z_host_v.data(); + work_host = work_host_v.data(); + + // Initialize cuBLAS + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // Compute largest eigenvalue + // ------------------------------------------------------- + + // Random number generator + curandGenerator_t randGen; + // Initialize random number generator + curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); + curandSetPseudoRandomGeneratorSeed(randGen, seed); + // Initialize initial Lanczos vector + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); + value_type_t normQ1; + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + + auto h_val = 1 / normQ1; + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + value_type_t shift_val = 0.0; + value_type_t *shift = &shift_val; + + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter; + + // Apply Lanczos method until convergence + value_type_t shiftLower = 1; + value_type_t shiftUpper = -1; + while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { + // Determine number of restart steps + // Number of steps must be even due to Francis algorithm + index_type_t iter_new = nEigVecs + 1; + if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) + iter_new = restartIter - (maxIter - *totalIter); + if ((restartIter - iter_new) % 2) iter_new -= 1; + if (iter_new == *effIter) break; + + // Implicit restart of Lanczos method + status = lanczosRestart( + handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, + beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false); + if (status) WARNING("error in Lanczos implicit restart"); + *effIter = iter_new; + + // Check for convergence + if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; + + // Proceed with Lanczos method + + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), + reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter - iter_new; + } + + // Warning if Lanczos has failed to converge + if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) { + WARNING("implicitly restarted Lanczos failed to converge"); + } + for (int i = 0; i < restartIter; ++i) { + for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0; + } + // Solve tridiagonal system + memcpy(work_host + 2 * (*effIter), alpha_host, + (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, + (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), + work_host + 3 * (*effIter), Z_host, *effIter, + work_host); + + // note: We need to pick the top nEigVecs eigenvalues + // but effItter can be larger than nEigVecs + // hence we add an offset for that case, because we want to access top nEigVecs eigenpairs in the + // matrix of size effIter. 
remember the array is sorted, so it is not needed for smallest + // eigenvalues case because the first ones are the smallest ones + + index_type_t top_eigenparis_idx_offset = *effIter - nEigVecs; + + // Debug : print nEigVecs largest eigenvalues + // for (int i = top_eigenparis_idx_offset; i < *effIter; ++i) + // std::cout <<*(work_host+(2*(*effIter)+i))<< " "; + // std::cout < +int computeLargestEigenvectors( + handle_t const &handle, sparse_matrix_t const &A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t &iter, + value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) { + // Matrix dimension + index_type_t n = A.nrows_; + + // Check that parameters are valid + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); + + // Allocate memory + std::vector alpha_host_v(restartIter); + std::vector beta_host_v(restartIter); + + value_type_t *alpha_host = alpha_host_v.data(); + value_type_t *beta_host = beta_host_v.data(); + + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); + + // Perform Lanczos method + index_type_t effIter; + int status = computeLargestEigenvectors( + handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, + &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), + eigVals_dev, eigVecs_dev, seed); + + // Clean up and return + return status; +} + +} // namespace raft diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 9de242ea10..9e8606df3e 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -182,21 +182,337 @@ inline void cusparsecoosortByRow( // NOLINT * @defgroup Gemmi cusparse gemmi operations * @{ */ -inline cusparseStatus_t cusparsegemmi( - cusparseHandle_t handle, int m, int n, int k, int nnz, const float* alpha, - const float* A, int lda, const float* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const float* beta, float* C, int ldc) { +template +cusparseStatus_t cusparsegemmi( // NOLINT + cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha, + const T* A, int lda, const T* cscValB, const int* cscColPtrB, + const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, + int k, int nnz, const float* alpha, + const float* A, int lda, + const float* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, const float* beta, + float* C, int ldc, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); } -inline cusparseStatus_t cusparsegemmi( - cusparseHandle_t handle, int m, int n, int k, int nnz, const double* alpha, - const double* A, int lda, const double* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const double* beta, double* C, int ldc) { +template <> +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, + int k, int nnz, const double* alpha, + const double* A, int lda, + 
const double* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, const double* beta, + double* C, int ldc, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); } /** @} */ +#if __CUDACC_VER_MAJOR__ > 10 +/** + * @defgroup cusparse Create CSR operations + * @{ + */ +template +cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, int64_t nnz, + IndexT* csrRowOffsets, IndexT* csrColInd, + ValueT* csrValues); +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int32_t* csrRowOffsets, + int32_t* csrColInd, + float* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F); +} +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int32_t* csrRowOffsets, + int32_t* csrColInd, + double* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); +} +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int64_t* csrRowOffsets, + int64_t* csrColInd, + float* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F); +} +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int64_t* csrRowOffsets, + int64_t* csrColInd, + double* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); +} +/** @} */ +/** + * @defgroup cusparse CreateDnVec operations + * @{ + */ +template +cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, + int64_t size, T* values); +template <> +inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, + int64_t size, float* values) { + return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_32F); +} +template <> +inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, + int64_t size, double* values) { + return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_64F); +} +/** @} */ + +/** + * @defgroup Csrmv cusparse SpMV operations + * @{ + */ +template +cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const T* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const T* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, + 
CUDA_R_32F, alg, bufferSize); +} +template <> +inline cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, + CUDA_R_64F, alg, bufferSize); +} + +template +cusparseStatus_t cusparsespmv(cusparseHandle_t handle, cusparseOperation_t opA, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, const T* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, T* externalBuffer, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsespmv( + cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + float* externalBuffer, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, + alg, externalBuffer); +} +template <> +inline cusparseStatus_t cusparsespmv( + cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + double* externalBuffer, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, + alg, externalBuffer); +} +/** @} */ +#else +/** + * @defgroup Csrmv cusparse csrmv operations + * @{ + */ +template +cusparseStatus_t cusparsecsrmv( // NOLINT + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, + const int* csrRowPtr, const int* csrColInd, const T* x, const T* beta, T* y, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmv( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const float* alpha, const cusparseMatDescr_t descr, const float* csrVal, + const int* csrRowPtr, const int* csrColInd, const float* x, const float* beta, + float* y, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseScsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, beta, y); +} +template <> +inline cusparseStatus_t cusparsecsrmv( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const double* alpha, const cusparseMatDescr_t descr, const double* csrVal, + const int* csrRowPtr, const int* csrColInd, const double* x, + const double* beta, double* y, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseDcsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, beta, y); +} +/** @} */ +#endif + +#if __CUDACC_VER_MAJOR__ > 10 +/** + * @defgroup Csrmm cusparse csrmm operations + * @{ + */ +template +cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, size_t* bufferSize, 
cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const float* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, + matC, CUDA_R_32F, alg, bufferSize); +} +template <> +inline cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const double* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const double* beta, + cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, size_t* bufferSize, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, + matC, CUDA_R_64F, alg, bufferSize); +} +template +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, T* externalBuffer, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const float* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, float* externalBuffer, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, + CUDA_R_32F, alg, externalBuffer); +} +template <> +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const double* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const double* beta, + cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, double* externalBuffer, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, + CUDA_R_64F, alg, externalBuffer); +} +/** @} */ +#else +/** + * @defgroup Csrmm cusparse csrmm operations + * @{ + */ +template +cusparseStatus_t cusparsecsrmm( // NOLINT + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, + const int* csrRowPtr, const int* csrColInd, const T* x, const int ldx, + const T* beta, T* y, const int ldy, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmm( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const float* alpha, const cusparseMatDescr_t descr, + const float* csrVal, const int* csrRowPtr, const int* csrColInd, + const float* x, const int ldx, const float* beta, float* y, const int ldy, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseScsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, ldx, beta, y, ldy); +} +template <> +inline cusparseStatus_t cusparsecsrmm( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const double* alpha, const cusparseMatDescr_t descr, + const double* csrVal, const 
int* csrRowPtr, const int* csrColInd, + const double* x, const int ldx, const double* beta, double* y, const int ldy, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseDcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, ldx, beta, y, ldy); +} +/** @} */ +#endif + +/** + * @defgroup csr2coo cusparse CSR to COO converter methods + * @{ + */ +template +void cusparsecsr2coo( // NOLINT + cusparseHandle_t handle, const int n, const int nnz, const T* csrRowPtr, + T* cooRowInd, cudaStream_t stream); +template <> +inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz, + const int* csrRowPtr, int* cooRowInd, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, + CUSPARSE_INDEX_BASE_ZERO)); +} +/** @} */ + +/** + * @defgroup setpointermode cusparse set pointer mode method + * @{ + */ +// no T dependency... +// template +// cusparseStatus_t cusparsesetpointermode( // NOLINT +// cusparseHandle_t handle, +// cusparsePointerMode_t mode, +// cudaStream_t stream); + +// template<> +inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, + cusparsePointerMode_t mode, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSetPointerMode(handle, mode); +} +/** @} */ + } // namespace sparse } // namespace raft diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp new file mode 100644 index 0000000000..922ae7cfab --- /dev/null +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
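// Hedged usage sketch for the generic-API SpMM wrappers above (CUDA 11+ code path).
// Everything here is illustrative: the cusparse descriptors matA/matB/matC are assumed
// to have been created by the caller (cusparseCreateCsr / cusparseCreateDnMat), and
// CUSPARSE_CHECK / CUDA_TRY are the repo's existing error-checking macros.
template <typename T>
void spmm_once(cusparseHandle_t h, cusparseOperation_t opA, cusparseOperation_t opB,
               const T* alpha, cusparseSpMatDescr_t matA, cusparseDnMatDescr_t matB,
               const T* beta, cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg,
               cudaStream_t stream) {
  // query the scratch-buffer size, allocate it, then run the typed SpMM wrapper
  size_t buf_size{0};
  CUSPARSE_CHECK(raft::sparse::cusparsespmm_bufferSize(
    h, opA, opB, alpha, matA, matB, beta, matC, alg, &buf_size, stream));
  void* buf{nullptr};
  CUDA_TRY(cudaMalloc(&buf, buf_size));
  CUSPARSE_CHECK(raft::sparse::cusparsespmm(
    h, opA, opB, alpha, matA, matB, beta, matC, alg, static_cast<T*>(buf), stream));
  CUDA_TRY(cudaFree(buf));
}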
+ */ +#pragma once + +#include +#include // for std::pair + +namespace raft { + +using namespace matrix; + +// aggregate of control params for Eigen Solver: +// +template +struct cluster_solver_config_t { + size_type_t n_clusters; + size_type_t maxIter; + + value_type_t tol; + + unsigned long long seed{123456}; +}; + +template +struct kmeans_solver_t { + explicit kmeans_solver_t(cluster_solver_config_t const& config) + : config_(config) {} + + template + std::pair solve( + handle_t const& handle, thrust_exe_policy_t t_exe_policy, + size_type_t n_obs_vecs, size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const { + RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); + RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); + value_type_t residual{}; + index_type_t iters{}; + kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, + config_.tol, config_.maxIter, obs, codes, residual, iters, + config_.seed); + return std::make_pair(residual, iters); + } + + auto const& get_config(void) const { return config_; } + + private: + cluster_solver_config_t config_; +}; +} // namespace raft diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp new file mode 100644 index 0000000000..e36dca2e0c --- /dev/null +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
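// Hedged usage sketch for the k-means cluster solver defined above. The template
// parameter lists are elided in this diff, so the <int, double> instantiation and the
// names d_obs / d_codes are illustrative assumptions rather than the PR's own test code.
#include <raft/spectral/cluster_solvers.hpp>
#include <thrust/system/cuda/execution_policy.h>

void cluster_sketch(raft::handle_t const& handle, int n, int d,
                    double const* d_obs,  // device, d x n column-major observation matrix
                    int* d_codes) {       // device, n cluster assignments (output)
  raft::cluster_solver_config_t<int, double> cfg{/*n_clusters=*/8, /*maxIter=*/100,
                                                 /*tol=*/1e-4};
  raft::kmeans_solver_t<int, double> solver{cfg};
  auto policy = thrust::cuda::par.on(handle.get_stream());
  auto res_iters = solver.solve(handle, policy, n, d, d_obs, d_codes);
  // res_iters.first: residual sum of squares; res_iters.second: k-means iterations used.
}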
+ */ +#pragma once + +#include + +namespace raft { + +using namespace matrix; + +// aggregate of control params for Eigen Solver: +// +template +struct eigen_solver_config_t { + size_type_t n_eigVecs; + size_type_t maxIter; + + size_type_t restartIter; + value_type_t tol; + + bool reorthogonalize{false}; + unsigned long long seed{ + 1234567}; // CAVEAT: this default value is now common to all instances of using seed in Lanczos; was not the case before: there were places where a default seed = 123456 was used; this may trigger slightly different # solver iterations +}; + +template +struct lanczos_solver_t { + explicit lanczos_solver_t(eigen_solver_config_t const& config) + : config_(config) {} + + index_type_t solve_smallest_eigenvectors( + handle_t const& handle, + sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const { + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + index_type_t iters{}; + computeSmallestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, + config_.restartIter, config_.tol, + config_.reorthogonalize, iters, eigVals, + eigVecs, config_.seed); + return iters; + } + + index_type_t solve_largest_eigenvectors( + handle_t const& handle, + sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const { + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + index_type_t iters{}; + computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, + config_.restartIter, config_.tol, + config_.reorthogonalize, iters, eigVals, eigVecs, + config_.seed); + return iters; + } + + auto const& get_config(void) const { return config_; } + + private: + eigen_solver_config_t config_; +}; +} // namespace raft diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp new file mode 100644 index 0000000000..08913d41e7 --- /dev/null +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -0,0 +1,957 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
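// Hedged usage sketch for the Lanczos eigen-solver wrapper above. Template arguments
// and buffer names are assumptions (the parameter lists are elided in this diff); L is
// expected to wrap a symmetric matrix such as the graph Laplacian from matrix_wrappers.hpp.
#include <raft/spectral/eigen_solvers.hpp>

void eigen_sketch(raft::handle_t const& handle,
                  raft::matrix::sparse_matrix_t<int, double> const& L,
                  double* d_eigvals,    // device, n_eigVecs entries
                  double* d_eigvecs) {  // device, nrows * n_eigVecs entries, column-major
  raft::eigen_solver_config_t<int, double> cfg{/*n_eigVecs=*/3, /*maxIter=*/200,
                                               /*restartIter=*/64, /*tol=*/1e-6};
  raft::lanczos_solver_t<int, double> solver{cfg};
  int iters = solver.solve_smallest_eigenvectors(handle, L, d_eigvals, d_eigvecs);
  (void)iters;  // number of Lanczos iterations actually performed
}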
+ */ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace { + +using namespace raft; +using namespace raft::linalg; +// ========================================================= +// Useful grid settings +// ========================================================= + +constexpr unsigned int BLOCK_SIZE = 1024; +constexpr unsigned int WARP_SIZE = 32; +constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); + +// ========================================================= +// CUDA kernels +// ========================================================= + +/** + * @brief Compute distances between observation vectors and centroids + * Block dimensions should be (warpSize, 1, + * blockSize/warpSize). Ideally, the grid is large enough so there + * are d threads in the x-direction, k threads in the y-direction, + * and n threads in the z-direction. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, d*n entries) Observation matrix. Matrix is + * stored column-major and each column is an observation + * vector. Matrix dimensions are d x n. + * @param centroids (Input, d*k entries) Centroid matrix. Matrix is + * stored column-major and each column is a centroid. Matrix + * dimensions are d x k. + * @param dists (Output, n*k entries) Distance matrix. Matrix is + * stored column-major and the (i,j)-entry is the square of the + * Euclidean distance between the ith observation vector and jth + * centroid. Matrix dimensions are n x k. Entries must be + * initialized to zero. + */ +template +static __global__ void computeDistances( + index_type_t n, index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) { + // Loop index + index_type_t i; + + // Block indices + index_type_t bidx; + // Global indices + index_type_t gidx, gidy, gidz; + + // Private memory + value_type_t centroid_private, dist_private; + + // Global x-index indicates index of vector entry + bidx = blockIdx.x; + while (bidx * blockDim.x < d) { + gidx = threadIdx.x + bidx * blockDim.x; + + // Global y-index indicates centroid + gidy = threadIdx.y + blockIdx.y * blockDim.y; + while (gidy < k) { + // Load centroid coordinate from global memory + centroid_private = (gidx < d) ? centroids[IDX(gidx, gidy, d)] : 0; + + // Global z-index indicates observation vector + gidz = threadIdx.z + blockIdx.z * blockDim.z; + while (gidz < n) { + // Load observation vector coordinate from global memory + dist_private = (gidx < d) ? 
obs[IDX(gidx, gidz, d)] : 0; + + // Compute contribution of current entry to distance + dist_private = centroid_private - dist_private; + dist_private = dist_private * dist_private; + + // Perform reduction on warp + for (i = WARP_SIZE / 2; i > 0; i /= 2) + dist_private += utils::shfl_down(dist_private, i, 2 * i); + + // Write result to global memory + if (threadIdx.x == 0) + utils::atomicFPAdd(dists + IDX(gidz, gidy, n), dist_private); + + // Move to another observation vector + gidz += blockDim.z * gridDim.z; + } + + // Move to another centroid + gidy += blockDim.y * gridDim.y; + } + + // Move to another vector entry + bidx += gridDim.x; + } +} + +/** + * @brief Find closest centroid to observation vectors. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Number of observation vectors. + * @param k Number of clusters. + * @param centroids (Input, d*k entries) Centroid matrix. Matrix is + * stored column-major and each column is a centroid. Matrix + * dimensions are d x k. + * @param dists (Input/output, n*k entries) Distance matrix. Matrix + * is stored column-major and the (i,j)-entry is the square of + * the Euclidean distance between the ith observation vector and + * jth centroid. Matrix dimensions are n x k. On exit, the first + * n entries give the square of the Euclidean distance between + * observation vectors and closest centroids. + * @param codes (Output, n entries) Cluster assignments. + * @param clusterSizes (Output, k entries) Number of points in each + * cluster. Entries must be initialized to zero. + */ +template +static __global__ void minDistances(index_type_t n, index_type_t k, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { + // Loop index + index_type_t i, j; + + // Current matrix entry + value_type_t dist_curr; + + // Smallest entry in row + value_type_t dist_min; + index_type_t code_min; + + // Each row in observation matrix is processed by a thread + i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + // Find minimum entry in row + code_min = 0; + dist_min = dists[IDX(i, 0, n)]; + for (j = 1; j < k; ++j) { + dist_curr = dists[IDX(i, j, n)]; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; + } + + // Transfer result to global memory + dists[i] = dist_min; + codes[i] = code_min; + + // Increment cluster sizes + atomicAdd(clusterSizes + code_min, 1); + + // Move to another row + i += blockDim.x * gridDim.x; + } +} + +/** + * @brief Check if newly computed distances are smaller than old distances. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Number of observation vectors. + * @param dists_old (Input/output, n entries) Distances between + * observation vectors and closest centroids. On exit, entries + * are replaced by entries in 'dists_new' if the corresponding + * observation vectors are closest to the new centroid. + * @param dists_new (Input, n entries) Distance between observation + * vectors and new centroid. + * @param codes_old (Input/output, n entries) Cluster + * assignments. 
On exit, entries are replaced with 'code_new' if + * the corresponding observation vectors are closest to the new + * centroid. + * @param code_new Index associated with new centroid. + */ +template +static __global__ void minDistances2(index_type_t n, + value_type_t* __restrict__ dists_old, + const value_type_t* __restrict__ dists_new, + index_type_t* __restrict__ codes_old, + index_type_t code_new) { + // Loop index + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; + + // Distances + value_type_t dist_old_private; + value_type_t dist_new_private; + + // Each row is processed by a thread + while (i < n) { + // Get old and new distances + dist_old_private = dists_old[i]; + dist_new_private = dists_new[i]; + + // Update if new distance is smaller than old distance + if (dist_new_private < dist_old_private) { + dists_old[i] = dist_new_private; + codes_old[i] = code_new; + } + + // Move to another row + i += blockDim.x * gridDim.x; + } +} + +/** + * @brief Compute size of k-means clusters. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam index_type_t the type of data used for indexing. + * @param n Number of observation vectors. + * @param k Number of clusters. + * @param codes (Input, n entries) Cluster assignments. + * @param clusterSizes (Output, k entries) Number of points in each + * cluster. Entries must be initialized to zero. + */ +template +static __global__ void computeClusterSizes( + index_type_t n, index_type_t k, const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + atomicAdd(clusterSizes + codes[i], 1); + i += blockDim.x * gridDim.x; + } +} + +/** + * @brief Divide rows of centroid matrix by cluster sizes. + * Divides the ith column of the sum matrix by the size of the ith + * cluster. If the sum matrix has been initialized so that the ith + * row is the sum of all observation vectors in the ith cluster, + * this kernel produces cluster centroids. The grid and block + * dimensions should be 2-dimensional. Ideally the grid is large + * enough so there are d threads in the x-direction and k threads + * in the y-direction. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param clusterSizes (Input, k entries) Number of points in each + * cluster. + * @param centroids (Input/output, d*k entries) Sum matrix. Matrix + * is stored column-major and matrix dimensions are d x k. The + * ith column is the sum of all observation vectors in the ith + * cluster. On exit, the matrix is the centroid matrix (each + * column is the mean position of a cluster). 
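// Standalone illustration of the warp-level reduction used inside computeDistances
// above (there it goes through utils::shfl_down); this free-standing variant uses the
// raw CUDA intrinsic and assumes all 32 lanes of the warp are active.
__inline__ __device__ float warp_reduce_sum(float val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffffu, val, offset);
  return val;  // after the loop, lane 0 holds the sum over the whole warp
}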
+ */ +template +static __global__ void divideCentroids( + index_type_t d, index_type_t k, const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids) { + // Global indices + index_type_t gidx, gidy; + + // Current cluster size + index_type_t clusterSize_private; + + // Observation vector is determined by global y-index + gidy = threadIdx.y + blockIdx.y * blockDim.y; + while (gidy < k) { + // Get cluster size from global memory + clusterSize_private = clusterSizes[gidy]; + + // Add vector entries to centroid matrix + // vector entris are determined by global x-index + gidx = threadIdx.x + blockIdx.x * blockDim.x; + while (gidx < d) { + centroids[IDX(gidx, gidy, d)] /= clusterSize_private; + gidx += blockDim.x * gridDim.x; + } + + // Move to another centroid + gidy += blockDim.y * gridDim.y; + } +} + +// ========================================================= +// Helper functions +// ========================================================= + +/** + * @brief Randomly choose new centroids. + * Centroid is randomly chosen with k-means++ algorithm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. + * @param handle the raft handle. + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param rand Random number drawn uniformly from [0,1). + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are n x d. + * @param dists (Input, device memory, 2*n entries) Workspace. The + * first n entries should be the distance between observation + * vectors and the closest centroid. + * @param centroid (Output, device memory, d entries) Centroid + * coordinates. + * @return Zero if successful. Otherwise non-zero. 
+ */ +template +static int chooseNewCentroid(handle_t const& handle, + thrust_exe_pol_t thrust_exec_policy, + index_type_t n, index_type_t d, index_type_t k, + value_type_t rand, + const value_type_t* __restrict__ obs, + value_type_t* __restrict__ dists, + value_type_t* __restrict__ centroid) { + // Cumulative sum of distances + value_type_t* distsCumSum = dists + n; + // Residual sum of squares + value_type_t distsSum{0}; + // Observation vector that is chosen as new centroid + index_type_t obsIndex; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Compute cumulative sum of distances + thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), + thrust::device_pointer_cast(distsCumSum)); + CHECK_CUDA(stream); + CUDA_TRY(cudaMemcpyAsync(&distsSum, distsCumSum + n - 1, sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); + + // Randomly choose observation vector + // Probabilities are proportional to square of distance to closest + // centroid (see k-means++ algorithm) + // + //seg-faults due to Thrust bug + //on binary-search-like algorithms + //when run with stream dependent + //execution policies; fixed on Thrust GitHub + //hence replace w/ linear interpolation, + //until the Thrust issue gets resolved: + // + // obsIndex = (thrust::lower_bound( + // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), + // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - + // thrust::device_pointer_cast(distsCumSum)); + // + //linear interpolation logic: + //{ + value_type_t minSum{0}; + CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(stream); + + if (distsSum > minSum) { + value_type_t vIndex = static_cast(n - 1); + obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / + (distsSum - minSum)); + } else { + obsIndex = 0; + } + //} + + CHECK_CUDA(stream); + obsIndex = max(obsIndex, 0); + obsIndex = min(obsIndex, n - 1); + + // Record new centroid position + CUDA_TRY(cudaMemcpyAsync(centroid, obs + IDX(0, obsIndex, d), + d * sizeof(value_type_t), cudaMemcpyDeviceToDevice, + stream)); + + return 0; +} + +/** + * @brief Choose initial cluster centroids for k-means algorithm. + * Centroids are randomly chosen with k-means++ algorithm + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy + * (assumed to have same stream as handle.stream). + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param dists (Output, device memory, 2*n entries) Workspace. On + * exit, the first n entries give the square of the Euclidean + * distance between observation vectors and the closest centroid. 
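// Host-side sketch of the k-means++ selection rule implemented in chooseNewCentroid
// above: draw index i with probability proportional to its squared distance to the
// nearest centroid. The device code currently replaces the lower_bound search with a
// linear interpolation to work around the Thrust issue noted in the comments above.
#include <algorithm>
#include <numeric>
#include <vector>

inline int choose_weighted_index(std::vector<double> const& sq_dists, double rand01) {
  if (sq_dists.empty()) return 0;
  std::vector<double> cumsum(sq_dists.size());
  std::partial_sum(sq_dists.begin(), sq_dists.end(), cumsum.begin());
  auto it = std::lower_bound(cumsum.begin(), cumsum.end(), rand01 * cumsum.back());
  return static_cast<int>(it - cumsum.begin());
}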
+ * @return Zero if successful. Otherwise non-zero. + */ +template +static int initializeCentroids( + handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, + value_type_t* __restrict__ centroids, index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ dists, + unsigned long long seed) { + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Loop index + index_type_t i; + + // Random number generator + thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution uniformDist(0, 1); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + constexpr index_type_t grid_lower_bound{65535}; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Initialize grid dimensions + dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; + + // CUDA grid dimensions + dim3 gridDim_warp{ + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; + + // CUDA grid dimensions + dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), + 1, 1}; + + // Assign observation vectors to code 0 + CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); + + // Choose first centroid + thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), 1); + CHECK_CUDA(stream); + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), + obs, dists, centroids)) + WARNING("error in k-means++ (could not pick centroid)"); + + // Compute distances from first centroid + CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); + computeDistances<<>>( + n, d, 1, obs, centroids, dists); + CHECK_CUDA(stream); + + // Choose remaining centroids + for (i = 1; i < k; ++i) { + // Choose ith centroid + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), + obs, dists, centroids + IDX(0, i, d))) + WARNING("error in k-means++ (could not pick centroid)"); + + // Compute distances from ith centroid + CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); + computeDistances<<>>( + n, d, 1, obs, centroids + IDX(0, i, d), dists + n); + CHECK_CUDA(stream); + + // Recompute minimum distances + minDistances2<<>>(n, dists, dists + n, + codes, i); + CHECK_CUDA(stream); + } + + // Compute cluster sizes + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + computeClusterSizes<<>>(n, k, codes, + clusterSizes); + CHECK_CUDA(stream); + + return 0; +} + +/** + * @brief Find cluster centroids closest to observation vectors. + * Distance is measured with Euclidean norm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy + * (assumed to have same stream as handle.stream). + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. 
Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param centroids (Input, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param dists (Output, device memory, n*k entries) Workspace. On + * exit, the first n entries give the square of the Euclidean + * distance between observation vectors and the closest centroid. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param residual_host (Output, host memory, 1 entry) Residual sum + * of squares of assignment. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int assignCentroids( + handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) { + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Compute distance between centroids and observation vectors + CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); + + // CUDA grid dimensions + dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; + + dim3 gridDim; + constexpr index_type_t grid_lower_bound{65535}; + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); + gridDim.y = min(k, grid_lower_bound); + gridDim.z = + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); + + computeDistances<<>>(n, d, k, obs, centroids, + dists); + CHECK_CUDA(stream); + + // Find centroid closest to each observation vector + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, + clusterSizes); + CHECK_CUDA(stream); + + // Compute residual sum of squares + *residual_host = + thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n)); + + return 0; +} + +/** + * @brief Update cluster centroids for k-means algorithm. + * All clusters are assumed to be non-empty. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy + * (assumed to have same stream as handle.stream). + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Input, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Input, device memory, k entries) Number of + * points in each cluster. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. 
+ * @param work (Output, device memory, n*d entries) Workspace. + * @param work_int (Output, device memory, 2*d*n entries) + * Workspace. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int updateCentroids(handle_t const& handle, + thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, + const index_type_t* __restrict__ codes, + const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids, + value_type_t* __restrict__ work, + index_type_t* __restrict__ work_int) { + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Useful constants + const value_type_t one = 1; + const value_type_t zero = 0; + + constexpr index_type_t grid_lower_bound{65535}; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Device memory + thrust::device_ptr obs_copy(work); + thrust::device_ptr codes_copy(work_int); + thrust::device_ptr rows(work_int + d * n); + + // Take transpose of observation matrix + CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, n, d, &one, obs, + d, &zero, (value_type_t*)NULL, n, + thrust::raw_pointer_cast(obs_copy), n, stream)); + + // Cluster assigned to each observation matrix entry + thrust::sequence(thrust_exec_policy, rows, rows + d * n); + CHECK_CUDA(stream); + thrust::transform(thrust_exec_policy, rows, rows + d * n, + thrust::make_constant_iterator(n), rows, + thrust::modulus()); + CHECK_CUDA(stream); + thrust::gather(thrust_exec_policy, rows, rows + d * n, + thrust::device_pointer_cast(codes), codes_copy); + CHECK_CUDA(stream); + + // Row associated with each observation matrix entry + thrust::sequence(thrust_exec_policy, rows, rows + d * n); + CHECK_CUDA(stream); + thrust::transform(thrust_exec_policy, rows, rows + d * n, + thrust::make_constant_iterator(n), rows, + thrust::divides()); + CHECK_CUDA(stream); + + // Sort and reduce to add observation vectors in same cluster + thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n, + make_zip_iterator(make_tuple(obs_copy, rows))); + CHECK_CUDA(stream); + thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy, + codes_copy, // Output to codes_copy is ignored + thrust::device_pointer_cast(centroids)); + CHECK_CUDA(stream); + + // Divide sums by cluster size to get centroid matrix + // + // CUDA grid dimensions + dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; + + // CUDA grid dimensions + dim3 gridDim{ + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), 1}; + + divideCentroids<<>>(d, k, clusterSizes, + centroids); + CHECK_CUDA(stream); + + return 0; +} + +} // namespace + +namespace raft { + +// ========================================================= +// k-means algorithm +// ========================================================= + +/** + * @brief Find clusters with k-means algorithm. + * Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. + * @param handle the raft handle. 
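// Simplified 1-D illustration of the sort_by_key / reduce_by_key idiom that
// updateCentroids above uses to sum observation entries sharing a cluster code
// (the real code does this on the device, per matrix row, to build centroid sums).
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>

inline void sum_per_code(thrust::host_vector<int> codes,         // code per entry (copied, then sorted)
                         thrust::host_vector<double> vals,       // value per entry (copied)
                         thrust::host_vector<int>& out_codes,    // unique codes
                         thrust::host_vector<double>& out_sums)  // sum of vals per code
{
  thrust::sort_by_key(codes.begin(), codes.end(), vals.begin());
  out_codes.resize(codes.size());
  out_sums.resize(vals.size());
  auto ends = thrust::reduce_by_key(codes.begin(), codes.end(), vals.begin(),
                                    out_codes.begin(), out_sums.begin());
  out_codes.resize(ends.first - out_codes.begin());
  out_sums.resize(ends.second - out_sums.begin());
}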
+ * @param thrust_exec_policy thrust execution policy + * (assumed to have same stream as handle.stream). + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param tol Tolerance for convergence. k-means stops when the + * change in residual divided by n is less than tol. + * @param maxiter Maximum number of k-means iterations. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param work (Output, device memory, n*max(k,d) entries) + * Workspace. + * @param work_int (Output, device memory, 2*d*n entries) + * Workspace. + * @param residual_host (Output, host memory, 1 entry) Residual sum + * of squares (sum of squares of distances between observation + * vectors and centroids). + * @param iters_host (Output, host memory, 1 entry) Number of + * k-means iterations. + * @param seed random seed to be used. + * @return error flag. + */ +template +int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, + index_type_t n, index_type_t d, index_type_t k, value_type_t tol, + index_type_t maxiter, const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids, + value_type_t* __restrict__ work, index_type_t* __restrict__ work_int, + value_type_t* residual_host, index_type_t* iters_host, + unsigned long long seed) { + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Current iteration + index_type_t iter; + + constexpr index_type_t grid_lower_bound{65535}; + + // Residual sum of squares at previous iteration + value_type_t residualPrev = 0; + + // Random number generator + thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution uniformDist(0, 1); + + // ------------------------------------------------------- + // Initialization + // ------------------------------------------------------- + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Trivial cases + if (k == 1) { + CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); + CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), + cudaMemcpyHostToDevice, stream)); + if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, + clusterSizes, centroids, work, work_int)) + WARNING("could not compute k-means centroids"); + + dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; + + dim3 gridDim{ + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, + min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), + grid_lower_bound)}; + + CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); + computeDistances<<>>(n, d, 1, obs, centroids, + work); + CHECK_CUDA(stream); + *residual_host = + thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(work), + thrust::device_pointer_cast(work + n)); + CHECK_CUDA(stream); + return 0; + } + if (n <= k) { + 
thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes), + thrust::device_pointer_cast(codes + n)); + CHECK_CUDA(stream); + thrust::fill_n(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), n, 1); + CHECK_CUDA(stream); + + if (n < k) + CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, + (k - n) * sizeof(index_type_t), stream)); + CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, stream)); + *residual_host = 0; + return 0; + } + + // Initialize cuBLAS + CUBLAS_CHECK( + linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // k-means++ algorithm + // ------------------------------------------------------- + + // Choose initial cluster centroids + if (initializeCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + codes, clusterSizes, work, seed)) + WARNING("could not initialize k-means centroids"); + + // Apply k-means iteration until convergence + for (iter = 0; iter < maxiter; ++iter) { + // Update cluster centroids + if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, + clusterSizes, centroids, work, work_int)) + WARNING("could not update k-means centroids"); + + // Determine centroid closest to each observation + residualPrev = *residual_host; + if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + + // Reinitialize empty clusters with new centroids + index_type_t emptyCentroid = + (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), 0) - + thrust::device_pointer_cast(clusterSizes)); + + // FIXME: emptyCentroid never reaches k (infinite loop) under certain + // conditions, such as if obs is corrupt (as seen as a result of a + // DataFrame column of NULL edge vals used to create the Graph) + while (emptyCentroid < k) { + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, + uniformDist(rng), obs, work, + centroids + IDX(0, emptyCentroid, d))) + WARNING("could not replace empty centroid"); + if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + emptyCentroid = + (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), 0) - + thrust::device_pointer_cast(clusterSizes)); + CHECK_CUDA(stream); + } + + // Check for convergence + if (std::fabs(residualPrev - (*residual_host)) / n < tol) { + ++iter; + break; + } + } + + // Warning if k-means has failed to converge + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) + WARNING("k-means failed to converge"); + + *iters_host = iter; + return 0; +} + +/** + * @brief Find clusters with k-means algorithm. + * Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy + * (assumed to have same stream as handle.stream). + * @param n Number of observation vectors. 
+ * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param tol Tolerance for convergence. k-means stops when the + * change in residual divided by n is less than tol. + * @param maxiter Maximum number of k-means iterations. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param residual On exit, residual sum of squares (sum of squares + * of distances between observation vectors and centroids). + * @param iters on exit, number of k-means iterations. + * @param seed random seed to be used. + * @return error flag + */ +template +int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, + index_type_t n, index_type_t d, index_type_t k, value_type_t tol, + index_type_t maxiter, const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, value_type_t& residual, + index_type_t& iters, unsigned long long seed = 123456) { + using namespace matrix; + + // Check that parameters are valid + RAFT_EXPECTS(n > 0, "invalid parameter (n<1)"); + RAFT_EXPECTS(d > 0, "invalid parameter (d<1)"); + RAFT_EXPECTS(k > 0, "invalid parameter (k<1)"); + RAFT_EXPECTS(tol > 0, "invalid parameter (tol<=0)"); + RAFT_EXPECTS(maxiter >= 0, "invalid parameter (maxiter<0)"); + + // Allocate memory + vector_t clusterSizes(handle, k); + vector_t centroids(handle, d * k); + vector_t work(handle, n * max(k, d)); + vector_t work_int(handle, 2 * d * n); + + // Perform k-means + return kmeans( + handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes, + clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual, + &iters, seed); +} + +} // namespace raft diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp new file mode 100644 index 0000000000..d14bf05f37 --- /dev/null +++ b/cpp/include/raft/spectral/lapack.hpp @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
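// Hedged sketch of calling the convenience kmeans overload above directly; the handle,
// the stream-bound Thrust policy, and the device buffers d_obs / d_codes are assumed to
// be set up by the caller. kmeans_solver_t (cluster_solvers.hpp) forwards to this entry point.
void kmeans_sketch(raft::handle_t const& handle, int n, int d,
                   double const* d_obs,  // device, d x n column-major observation matrix
                   int* d_codes) {       // device, n cluster assignments (output)
  double residual{0};
  int iters{0};
  auto policy = thrust::cuda::par.on(handle.get_stream());
  raft::kmeans(handle, policy, n, d, /*k=*/8, /*tol=*/1e-4, /*maxiter=*/100,
               d_obs, d_codes, residual, iters);  // default seed (123456) retained
}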
+ */ + +#pragma once +#include + +#include +#include +#include + +//for now; TODO: check if/where this `define` should be; +// +#define USE_LAPACK + +namespace raft { + +#define lapackCheckError(status) \ + { \ + if (status < 0) { \ + std::stringstream ss; \ + ss << "Lapack error: argument number " << -status \ + << " had an illegal value."; \ + throw exception(ss.str()); \ + } else if (status > 0) \ + RAFT_FAIL("Lapack error: internal error."); \ + } + +extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, + float *work, int *lwork, int *info); +extern "C" void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, + double *work, int *lwork, int *info); +extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k, + float *a, int *lda, const float *tau, float *c, + int *ldc, float *work, int *lwork, int *info); +extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k, + double *a, int *lda, const double *tau, double *c, + int *ldc, double *work, int *lwork, int *info); +extern "C" int dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, + double *wr, double *wi, double *vl, int *ldvl, double *vr, + int *ldvr, double *work, int *lwork, int *info); + +extern "C" int sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, + float *wr, float *wi, float *vl, int *ldvl, float *vr, + int *ldvr, float *work, int *lwork, int *info); + +extern "C" cusolverStatus_t cusolverDnSgemmHost( + cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, + const float *alpha, const float *A, int lda, const float *B, int ldb, + const float *beta, float *C, int ldc); + +extern "C" cusolverStatus_t cusolverDnDgemmHost( + cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, + const double *alpha, const double *A, int lda, const double *B, int ldb, + const double *beta, double *C, int ldc); + +extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float *d, float *e, + int *info); + +extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double *d, double *e, + int *info); + +extern "C" cusolverStatus_t cusolverDnSsteqrHost(const signed char *compz, + int n, float *d, float *e, + float *z, int ldz, float *work, + int *info); + +extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char *compz, + int n, double *d, double *e, + double *z, int ldz, + double *work, int *info); + +template +class Lapack { + private: + Lapack(); + ~Lapack(); + + public: + static void check_lapack_enabled(); + + static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, + const T *A, int lda, const T *B, int ldb, T beta, T *C, + int ldc); + + // special QR for lanczos + static void sterf(int n, T *d, T *e); + static void steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work); + + // QR + // computes the QR factorization of a general matrix + static void geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork); + // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. 
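// Usage sketch (hedged, not part of this PR): the tridiagonal path above is typically
// driven like
//
//   std::vector<double> d(m), e(m - 1);         // diagonal / off-diagonal of a tridiagonal system
//   std::vector<double> z(m * m), work(2 * m);  // eigenvector matrix and scratch
//   Lapack<double>::steqr('I', m, d.data(), e.data(), z.data(), m, work.data());
//
// after which d holds the eigenvalues in ascending order and the columns of z the
// corresponding eigenvectors. The 'I' mode and the 2*m workspace size follow the
// standard LAPACK steqr convention and are assumptions, not values taken from this diff.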
+ + // multiply C by implicit Q + static void ormqr(bool right_side, bool transq, int m, int n, int k, T *a, + int lda, T *tau, T *c, int ldc, T *work, int *lwork); + + static void geev(T *A, T *eigenvalues, int dim, int lda); + static void geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, + int ldvr); + static void geev(T *A, T *eigenvalues_r, T *eigenvalues_i, T *eigenvectors_r, + T *eigenvectors_i, int dim, int lda, int ldvr); + + private: + static void lapack_gemm(const char transa, const char transb, int m, int n, + int k, float alpha, const float *a, int lda, + const float *b, int ldb, float beta, float *c, + int ldc) { + cublasOperation_t cublas_transa = + (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = + (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnSgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, + (float *)a, lda, (float *)b, ldb, &beta, c, ldc); + } + + static void lapack_gemm(const signed char transa, const signed char transb, + int m, int n, int k, double alpha, const double *a, + int lda, const double *b, int ldb, double beta, + double *c, int ldc) { + cublasOperation_t cublas_transa = + (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = + (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnDgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, + (double *)a, lda, (double *)b, ldb, &beta, c, ldc); + } + + static void lapack_sterf(int n, float *d, float *e, int *info) { + cusolverDnSsterfHost(n, d, e, info); + } + + static void lapack_sterf(int n, double *d, double *e, int *info) { + cusolverDnDsterfHost(n, d, e, info); + } + + static void lapack_steqr(const signed char compz, int n, float *d, float *e, + float *z, int ldz, float *work, int *info) { + cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info); + } + + static void lapack_steqr(const signed char compz, int n, double *d, double *e, + double *z, int ldz, double *work, int *info) { + cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info); + } + + static void lapack_geqrf(int m, int n, float *a, int lda, float *tau, + float *work, int *lwork, int *info) { + sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); + } + + static void lapack_geqrf(int m, int n, double *a, int lda, double *tau, + double *work, int *lwork, int *info) { + dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); + } + + static void lapack_ormqr(char side, char trans, int m, int n, int k, float *a, + int lda, float *tau, float *c, int ldc, float *work, + int *lwork, int *info) { + sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, + info); + } + + static void lapack_ormqr(char side, char trans, int m, int n, int k, + double *a, int lda, double *tau, double *c, int ldc, + double *work, int *lwork, int *info) { + dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, + info); + } + + static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, double *a, + int *lda, double *wr, double *wi, double *vl, + int *ldvl, double *vr, int *ldvr, + double *work, int *lwork, int *info) { + return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, + lwork, info); + } + + static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, float *a, + int *lda, float *wr, float *wi, float *vl, + int *ldvl, float *vr, int *ldvr, float *work, + int *lwork, int *info) { + return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, + lwork, info); + } + + // real eigenvalues + static void lapack_geev(T *A, 
T *eigenvalues, int dim, int lda) { + char job = 'N'; + std::vector WI(dim); + int ldv = 1; + T *vl = 0; + int work_size = 6 * dim; + std::vector work(work_size); + int info; + lapack_geev_dispatch(&job, &job, &dim, A, &lda, eigenvalues, WI.data(), vl, + &ldv, vl, &ldv, work.data(), &work_size, &info); + lapackCheckError(info); + } + + // real eigenpairs + static void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim, + int lda, int ldvr) { + char jobvl = 'N'; + char jobvr = 'V'; + std::vector WI(dim); + int work_size = 6 * dim; + T *vl = 0; + int ldvl = 1; + std::vector work(work_size); + int info; + lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues, WI.data(), + vl, &ldvl, eigenvectors, &ldvr, work.data(), + &work_size, &info); + lapackCheckError(info); + } + + // complex eigenpairs + static void lapack_geev(T *A, T *eigenvalues_r, T *eigenvalues_i, + T *eigenvectors_r, T *eigenvectors_i, int dim, + int lda, int ldvr) { + char jobvl = 'N'; + char jobvr = 'V'; + int work_size = 8 * dim; + int ldvl = 1; + std::vector work(work_size); + int info; + lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues_r, + eigenvalues_i, 0, &ldvl, eigenvectors_r, &ldvr, + work.data(), &work_size, &info); + lapackCheckError(info); + } +}; + +template +void Lapack::check_lapack_enabled() { +#ifndef USE_LAPACK + RAFT_FAIL("Error: LAPACK not enabled."); +#endif +} + +template +void Lapack::gemm(bool transa, bool transb, int m, int n, int k, T alpha, + const T *A, int lda, const T *B, int ldb, T beta, T *C, + int ldc) { + // check_lapack_enabled(); + //#ifdef NVGRAPH_USE_LAPACK + const char transA_char = transa ? 'T' : 'N'; + const char transB_char = transb ? 'T' : 'N'; + lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); + //#endif +} + +template +void Lapack::sterf(int n, T *d, T *e) { + // check_lapack_enabled(); + //#ifdef NVGRAPH_USE_LAPACK + int info; + lapack_sterf(n, d, e, &info); + lapackCheckError(info); + //#endif +} + +template +void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) { + // check_lapack_enabled(); + //#ifdef NVGRAPH_USE_LAPACK + int info; + lapack_steqr(compz, n, d, e, z, ldz, work, &info); + lapackCheckError(info); + //#endif +} + +template +void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, + int *lwork) { + check_lapack_enabled(); +#ifdef USE_LAPACK + int info; + lapack_geqrf(m, n, a, lda, tau, work, lwork, &info); + lapackCheckError(info); +#endif +} +template +void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, T *a, + int lda, T *tau, T *c, int ldc, T *work, int *lwork) { + check_lapack_enabled(); +#ifdef USE_LAPACK + char side = right_side ? 'R' : 'L'; + char trans = transq ? 
'T' : 'N'; + int info; + lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); + lapackCheckError(info); +#endif +} + +// real eigenvalues +template +void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) { + check_lapack_enabled(); +#ifdef USE_LAPACK + lapack_geev(A, eigenvalues, dim, lda); +#endif +} +// real eigenpairs +template +void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, + int ldvr) { + check_lapack_enabled(); +#ifdef USE_LAPACK + lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr); +#endif +} +// complex eigenpairs +template +void Lapack::geev(T *A, T *eigenvalues_r, T *eigenvalues_i, + T *eigenvectors_r, T *eigenvectors_i, int dim, int lda, + int ldvr) { + check_lapack_enabled(); +#ifdef USE_LAPACK + lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, + dim, lda, ldvr); +#endif +} + +} // namespace raft diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp new file mode 100644 index 0000000000..08f213cd3a --- /dev/null +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#include + +// ========================================================= +// Useful macros +// ========================================================= + +// Get index of matrix entry +#define IDX(i, j, lda) ((i) + (j) * (lda)) + +namespace raft { +namespace matrix { + +using size_type = int; // for now; TODO: move it in appropriate header + +// Vector "view"-like aggregate for linear algebra purposes +// +template +struct vector_view_t { + value_type* buffer_; + size_type size_; + + vector_view_t(value_type* buffer, size_type sz) + : buffer_(buffer), size_(sz) {} + + vector_view_t(vector_view_t&& other) + : buffer_(other.buffer_), size_(other.size_) { + other.buffer_ = nullptr; + other.size_ = 0; + } + + vector_view_t& operator=(vector_view_t&& other) { + buffer_ = other.buffer_; + size_ = other.size_; + + other.buffer_ = nullptr; + other.size_ = 0; + } +}; + +// allocatable vector, using raft handle allocator +// +template +class vector_t { + handle_t const& handle_; + value_type* buffer_; + size_type size_; + cudaStream_t stream_; + + public: + vector_t(handle_t const& raft_handle, size_type sz) + : handle_(raft_handle), + buffer_( + static_cast(raft_handle.get_device_allocator()->allocate( + sz * sizeof(value_type), raft_handle.get_stream()))), + size_(sz), + stream_(raft_handle.get_stream()) {} + + ~vector_t(void) { + handle_.get_device_allocator()->deallocate(buffer_, size_, stream_); + } + + size_type size(void) const { return size_; } + + value_type* raw(void) { return buffer_; } + + value_type const* raw(void) const { return buffer_; } + + template + value_type nrm1(ThrustExecPolicy t_exe_pol) const { + return thrust::reduce(t_exe_pol, buffer_, buffer_ + 
size_, value_type{0}, + [] __device__(auto left, auto right) { + auto abs_left = left > 0 ? left : -left; + auto abs_right = right > 0 ? right : -right; + return abs_left + abs_right; + }); + } + + template + void fill(ThrustExecPolicy t_exe_pol, value_type value) { + thrust::fill_n(t_exe_pol, buffer_, size_, value); + } +}; + +template +struct sparse_matrix_t { + sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) + : handle_(raft_handle), + row_offsets_(row_offsets), + col_indices_(col_indices), + values_(values), + nrows_(nrows), + nnz_(nnz) {} + + template + sparse_matrix_t(handle_t const& raft_handle, CSRView const& csr_view) + : handle_(raft_handle), + row_offsets_(csr_view.offsets), + col_indices_(csr_view.indices), + values_(csr_view.edge_data), + nrows_(csr_view.number_of_vertices), + nnz_(csr_view.number_of_edges) {} + + virtual ~sparse_matrix_t(void) = + default; // virtual because used as base for following matrix types + + // y = alpha*A*x + beta*y + //(Note: removed const-ness of x, because CUDA 11 SpMV + // descriptor creation works with non-const, and const-casting + // down is dangerous) + // + virtual void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + value_type* __restrict__ y, bool transpose = false, + bool symmetric = false) const { + using namespace sparse; + + RAFT_EXPECTS(x != nullptr, "Null x buffer."); + RAFT_EXPECTS(y != nullptr, "Null y buffer."); + + auto cusparse_h = handle_.get_cusparse_handle(); + auto stream = handle_.get_stream(); + + cusparseOperation_t trans = + transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose + CUSPARSE_OPERATION_NON_TRANSPOSE; //non-transpose + +#if __CUDACC_VER_MAJOR__ > 10 + + //create descriptors: + // + cusparseSpMatDescr_t matA; + CUSPARSE_CHECK(cusparsecreatecsr(&matA, nrows_, nrows_, nnz_, row_offsets_, + col_indices_, values_)); + + cusparseDnVecDescr_t vecX; + CUSPARSE_CHECK(cusparsecreatednvec(&vecX, nrows_, x)); + + cusparseDnVecDescr_t vecY; + CUSPARSE_CHECK(cusparsecreatednvec(&vecY, nrows_, y)); + + //get (scratch) external device buffer size: + // + size_t bufferSize; + CUSPARSE_CHECK(cusparsespmv_buffersize(cusparse_h, opA, &alpha, matA, vecX, + &beta, vecY, alg, &bufferSize, + stream)); + + //allocate external buffer: + // + vector_t external_buffer(handle_, bufferSize); + + //finally perform SpMV: + // + CUSPARSE_CHECK(cusparsespmv(cusparse_h, trans, &alpha, matA, vecX, &beta, + vecY, CUSPARSE_CSRMV_ALG1, + external_buffer.raw(), stream)); + + //free descriptors: + //(TODO: maybe wrap them in a RAII struct?) 
+ // + CUSPARSE_CHECK(cusparseDestroyDnVec(vecY)); + CUSPARSE_CHECK(cusparseDestroyDnVec(vecX)); + CUSPARSE_CHECK(cusparseDestroySpMat(matA)); +#else + CUSPARSE_CHECK( + cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); + cusparseMatDescr_t descr = 0; + CUSPARSE_CHECK(cusparseCreateMatDescr(&descr)); + if (symmetric) { + CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_SYMMETRIC)); + } else { + CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + } + CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, nrows_, nnz_, + &alpha, descr, values_, row_offsets_, + col_indices_, x, &beta, y, stream)); + CUSPARSE_CHECK(cusparseDestroyMatDescr(descr)); +#endif + } + + handle_t const& get_handle(void) const { return handle_; } + + //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate + + handle_t const& handle_; + index_type const* row_offsets_; + index_type const* col_indices_; + value_type const* values_; + index_type const nrows_; + index_type const nnz_; +}; + +template +struct laplacian_matrix_t : sparse_matrix_t { + template + laplacian_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, + index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) + : sparse_matrix_t(raft_handle, row_offsets, + col_indices, values, nrows, nnz), + diagonal_(raft_handle, nrows) { + vector_t ones{raft_handle, nrows}; + ones.fill(thrust_exec_policy, 1.0); + sparse_matrix_t::mv(1, ones.raw(), 0, + diagonal_.raw()); + } + + template + laplacian_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const& csr_m) + : sparse_matrix_t(raft_handle, csr_m.row_offsets_, + csr_m.col_indices_, csr_m.values_, + csr_m.nrows_, csr_m.nnz_), + diagonal_(raft_handle, csr_m.nrows_) { + vector_t ones{raft_handle, csr_m.nrows_}; + ones.fill(thrust_exec_policy, 1.0); + sparse_matrix_t::mv(1, ones.raw(), 0, + diagonal_.raw()); + } + + // y = alpha*A*x + beta*y + // + void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + value_type* __restrict__ y, bool transpose = false, + bool symmetric = false) const override { + constexpr int BLOCK_SIZE = 1024; + auto n = sparse_matrix_t::nrows_; + + auto cublas_h = + sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = + sparse_matrix_t::get_handle().get_stream(); + + // scales y by beta: + // + if (beta == 0) { + CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); + } else if (beta != 1) { + CUBLAS_CHECK(linalg::cublasscal(cublas_h, n, &beta, y, 1, stream)); + } + + // Apply diagonal matrix + // + dim3 gridDim{ + std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; + + dim3 blockDim{BLOCK_SIZE, 1, 1}; + utils::diagmv<<>>(n, alpha, diagonal_.raw(), + x, y); + CHECK_CUDA(stream); + + // Apply adjacency matrix + // + sparse_matrix_t::mv(-alpha, x, 1, y); + } + + vector_t diagonal_; +}; + +template +struct modularity_matrix_t : laplacian_matrix_t { + template + modularity_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, + index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) + : laplacian_matrix_t( + raft_handle, thrust_exec_policy, row_offsets, col_indices, values, + nrows, nnz) { + edge_sum_ = 
laplacian_matrix_t::diagonal_.nrm1( + thrust_exec_policy); + } + + template + modularity_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const& csr_m) + : laplacian_matrix_t(raft_handle, + thrust_exec_policy, csr_m) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( + thrust_exec_policy); + } + + // y = alpha*A*x + beta*y + // + void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + value_type* __restrict__ y, bool transpose = false, + bool symmetric = false) const override { + auto n = sparse_matrix_t::nrows_; + + auto cublas_h = + sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = + sparse_matrix_t::get_handle().get_stream(); + + // y = A*x + // + sparse_matrix_t::mv(alpha, x, 0, y); + value_type dot_res; + + // gamma = d'*x + // + // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); + CUBLAS_CHECK(linalg::cublasdot( + cublas_h, n, laplacian_matrix_t::diagonal_.raw(), + 1, x, 1, &dot_res, stream)); + + // y = y -(gamma/edge_sum)*d + // + value_type gamma_ = -dot_res / edge_sum_; + CUBLAS_CHECK(linalg::cublasaxpy( + cublas_h, n, &gamma_, + laplacian_matrix_t::diagonal_.raw(), 1, y, 1, + stream)); + } + + value_type edge_sum_; +}; + +} // namespace matrix +} // namespace raft diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp new file mode 100644 index 0000000000..f8dfe5daa3 --- /dev/null +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#ifdef COLLECT_TIME_STATISTICS +#include +#include +#include +#include +#include +#endif + +#ifdef COLLECT_TIME_STATISTICS +static double timer(void) { + struct timeval tv; + cudaDeviceSynchronize(); + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} +#endif + +namespace raft { +namespace spectral { + +using namespace matrix; +using namespace linalg; + +// ========================================================= +// Spectral modularity_maximization +// ========================================================= + +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. 
+ * @param clusters (Output, device memory, n entries) Cluster + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return error flag. + */ +template +std::tuple modularity_maximization( + handle_t const &handle, ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const &csr_m, + EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, + vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + std::tuple + stats; // # iters eigen solver, cluster solver residual, # iters cluster solver + + vertex_t n = csr_m.nrows_; + + // Compute eigenvectors of Modularity Matrix + + // Initialize Modularity Matrix + modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; + + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute eigenvectors corresponding to largest eigenvalues + std::get<0>(stats) = + eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); + + // notice that at this point the matrix has already been transposed, so we are scaling + // columns + scale_obs(nEigVecs, n, eigVecs); + CHECK_CUDA(stream); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, + nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; +} +//=================================================== +// Analysis of graph partition +// ========================================================= + +/// Compute modularity +/** This function determines the modularity based on a graph and cluster assignments + * @param G Weighted graph in CSR format + * @param nClusters Number of clusters. + * @param clusters (Input, device memory, n entries) Cluster assignments. 
+ * @param modularity On exit, modularity + */ +template +void analyzeModularity(handle_t const &handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const &csr_m, + vertex_t nClusters, + vertex_t const *__restrict__ clusters, + weight_t &modularity) { + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + weight_t partModularity, clustersize; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Device memory + vector_t part_i(handle, n); + vector_t Bx(handle, n); + + // Initialize cuBLAS + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Modularity + modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; + + // Initialize output + modularity = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, + partModularity, clusters, part_i, Bx, B)) { + WARNING("empty partition"); + continue; + } + + // Record results + modularity += partModularity; + } + + modularity = modularity / B.diagonal_.nrm1(thrust_exec_policy); +} + +} // namespace spectral +} // namespace raft diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp new file mode 100644 index 0000000000..841fca04d9 --- /dev/null +++ b/cpp/include/raft/spectral/partition.hpp @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace raft { +namespace spectral { + +using namespace matrix; +using namespace linalg; + +// ========================================================= +// Spectral partitioner +// ========================================================= + +/// Compute spectral graph partition +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param clusters (Output, device memory, n entries) Partition + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return statistics: number of eigensolver iterations, . 
+ */ +template +std::tuple partition( + handle_t const &handle, ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const &csr_m, + EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, + vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + std::tuple + stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver + + vertex_t n = csr_m.nrows_; + + // ------------------------------------------------------- + // Spectral partitioner + // ------------------------------------------------------- + + // Compute eigenvectors of Laplacian + + // Initialize Laplacian + ///sparse_matrix_t A{handle, graph}; + laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; + + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute smallest eigenvalues and eigenvectors + std::get<0>(stats) = + eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, + nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; +} + +// ========================================================= +// Analysis of graph partition +// ========================================================= + +/// Compute cost function for partition +/** This function determines the edges cut by a partition and a cost + * function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * Graph is assumed to be weighted and undirected. + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param clusters (Input, device memory, n entries) Partition + * assignments. + * @param edgeCut On exit, weight of edges cut by partition. + * @param cost On exit, partition cost function. + * @return error flag. 
+ */ +template +void analyzePartition(handle_t const &handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const &csr_m, + vertex_t nClusters, const vertex_t *__restrict__ clusters, + weight_t &edgeCut, weight_t &cost) { + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + weight_t partEdgesCut, clustersize; + + // Device memory + vector_t part_i(handle, n); + vector_t Lx(handle, n); + + // Initialize cuBLAS + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Laplacian + ///sparse_matrix_t A{handle, graph}; + laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; + + // Initialize output + cost = 0; + edgeCut = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + // Construct indicator vector for ith partition + if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, + partEdgesCut, clusters, part_i, Lx, L)) { + WARNING("empty partition"); + continue; + } + + // Record results + cost += partEdgesCut / clustersize; + edgeCut += partEdgesCut / 2; + } +} + +} // namespace spectral +} // namespace raft diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp new file mode 100644 index 0000000000..b77375b33b --- /dev/null +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +namespace raft { +namespace spectral { + +template +static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, + value_type_t* obs) { + index_type_t i, j, k, index, mm; + value_type_t alpha, v, last; + bool valid; + // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension + + // compute alpha + mm = (((m + blockDim.x - 1) / blockDim.x) * + blockDim.x); // m in multiple of blockDim.x + alpha = 0.0; + + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < mm; i += blockDim.x) { + // check if the thread is valid + valid = i < m; + + // get the value of the last thread + last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + + // if you are valid read the value from memory, otherwise set your value to 0 + alpha = (valid) ? 
obs[i + j * m] : 0.0; + alpha = alpha * alpha; + + // do prefix sum (of size warpSize=blockDim.x =< 32) + for (k = 1; k < blockDim.x; k *= 2) { + v = utils::shfl_up(alpha, k, blockDim.x); + if (threadIdx.x >= k) alpha += v; + } + // shift by last + alpha += last; + } + } + + // scale by alpha + alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + alpha = std::sqrt(alpha); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 + index = i + j * m; + obs[index] = obs[index] / alpha; + } + } +} + +template +index_type_t next_pow2(index_type_t n) { + index_type_t v; + // Reference: + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float + v = n - 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v + 1; +} + +template +cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { + index_type_t p2m; + + // find next power of 2 + p2m = next_pow2(m); + // setup launch configuration + unsigned int xsize = max(2, min(p2m, 32)); + dim3 nthreads{xsize, 256 / xsize, 1}; + + dim3 nblocks{1, (n + nthreads.y - 1) / nthreads.y, 1}; + + // launch scaling kernel (scale each column of obs by its norm) + scale_obs_kernel + <<>>(m, n, obs); + + return cudaSuccess; +} + +template +void transform_eigen_matrix(handle_t const& handle, + ThrustExePolicy thrust_exec_policy, edge_t n, + vertex_t nEigVecs, weight_t* eigVecs) { + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + const weight_t zero{0.0}; + const weight_t one{1.0}; + + // Whiten eigenvector matrix + for (auto i = 0; i < nEigVecs; ++i) { + weight_t mean, std; + + mean = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + CHECK_CUDA(stream); + mean /= n; + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(mean), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::minus()); + CHECK_CUDA(stream); + + CUBLAS_CHECK( + cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + + std /= std::sqrt(static_cast(n)); + + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(std), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::divides()); + CHECK_CUDA(stream); + } + + // Transpose eigenvector matrix + // TODO: in-place transpose + { + vector_t work(handle, nEigVecs * n); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n, + &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs, + work.raw(), nEigVecs, stream)); + + CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), + nEigVecs * n * sizeof(weight_t), + cudaMemcpyDeviceToDevice, stream)); + } +} + +namespace { +/// Functor to generate indicator vectors +/** For use in Thrust transform + */ +template +struct equal_to_i_op { + const index_type_t i; + + public: + equal_to_i_op(index_type_t _i) : i(_i) {} + template + __host__ __device__ void operator()(Tuple_ t) { + thrust::get<1>(t) = + (thrust::get<0>(t) == i) ? 
(value_type_t)1.0 : (value_type_t)0.0; + } +}; +} // namespace + +// Construct indicator vector for ith partition +// +template +bool construct_indicator(handle_t const& handle, + ThrustExePolicy thrust_exec_policy, edge_t index, + edge_t n, weight_t& clustersize, weight_t& partStats, + vertex_t const* __restrict__ clusters, + vector_t& part_i, vector_t& Bx, + laplacian_matrix_t const& B) { + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + thrust::for_each(thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple( + thrust::device_pointer_cast(clusters), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple( + thrust::device_pointer_cast(clusters + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(index)); + CHECK_CUDA(stream); + + // Compute size of ith partition + CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, + &clustersize, stream)); + + clustersize = round(clustersize); + if (clustersize < 0.5) { + return false; + } + + // Compute part stats + B.mv(1, part_i.raw(), 0, Bx.raw()); + CUBLAS_CHECK( + cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + + return true; +} + +} // namespace spectral +} // namespace raft diff --git a/cpp/include/raft/spectral/warn_dbg.hpp b/cpp/include/raft/spectral/warn_dbg.hpp new file mode 100644 index 0000000000..406f1b7c7e --- /dev/null +++ b/cpp/include/raft/spectral/warn_dbg.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include +#include + +#define STRINGIFY_DETAIL(x) #x +#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) + +#ifdef DEBUG +#define COUT() (std::cout) +#define CERR() (std::cerr) + +//nope: +// +#define WARNING(message) \ + do { \ + std::stringstream ss; \ + ss << "Warning (" << __FILE__ << ":" << __LINE__ << "): " << message; \ + CERR() << ss.str() << std::endl; \ + } while (0) +#else // DEBUG +#define WARNING(message) +#endif diff --git a/cpp/include/raft/utils/sm_utils.hpp b/cpp/include/raft/utils/sm_utils.hpp new file mode 100644 index 0000000000..34eeec16bd --- /dev/null +++ b/cpp/include/raft/utils/sm_utils.hpp @@ -0,0 +1,435 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifdef _MSC_VER +#include +#else +#include +#endif + +#define DEFAULT_MASK 0xffffffff + +#define USE_CG 1 +//(__CUDACC_VER__ >= 80500) + +namespace raft { +namespace utils { +static __device__ __forceinline__ int lane_id() { + int id; + asm("mov.u32 %0, %%laneid;" : "=r"(id)); + return id; +} + +static __device__ __forceinline__ int lane_mask_lt() { + int mask; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); + return mask; +} + +static __device__ __forceinline__ int lane_mask_le() { + int mask; + asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); + return mask; +} + +static __device__ __forceinline__ int warp_id() { return threadIdx.x >> 5; } + +static __device__ __forceinline__ unsigned int ballot(int p, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#if USE_CG + return __ballot_sync(mask, p); +#else + return __ballot(p); +#endif +#else + return 0; +#endif +} + +static __device__ __forceinline__ int shfl(int r, int lane, int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#if USE_CG + return __shfl_sync(mask, r, lane, bound); +#else + return __shfl(r, lane, bound); +#endif +#else + return 0; +#endif +} + +static __device__ __forceinline__ float shfl(float r, int lane, int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#if USE_CG + return __shfl_sync(mask, r, lane, bound); +#else + return __shfl(r, lane, bound); +#endif +#else + return 0.0f; +#endif +} + +/// Warp shuffle down function +/** Warp shuffle functions on 64-bit floating point values are not + * natively implemented as of Compute Capability 5.0. This + * implementation has been copied from + * (http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler). + * Once this is natively implemented, this function can be replaced + * by __shfl_down. 
+ * + */ +static __device__ __forceinline__ double shfl(double r, int lane, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_sync(mask, a.x, lane, bound); + a.y = __shfl_sync(mask, a.y, lane, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl(a.x, lane, bound); + a.y = __shfl(a.y, lane, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ long long shfl(long long r, int lane, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_sync(mask, a.x, lane, bound); + a.y = __shfl_sync(mask, a.y, lane, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl(a.x, lane, bound); + a.y = __shfl(a.y, lane, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ int shfl_down(int r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_down_sync(mask, r, offset, bound); +#else + return __shfl_down(r, offset, bound); +#endif +#else + return 0.0f; +#endif +} + +static __device__ __forceinline__ float shfl_down(float r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_down_sync(mask, r, offset, bound); +#else + return __shfl_down(r, offset, bound); +#endif +#else + return 0.0f; +#endif +} + +static __device__ __forceinline__ double shfl_down(double r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down_sync(mask, a.x, offset, bound); + a.y = __shfl_down_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down(a.x, offset, bound); + a.y = __shfl_down(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ long long shfl_down(long long r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down_sync(mask, a.x, offset, bound); + a.y = __shfl_down_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down(a.x, offset, bound); + a.y = __shfl_down(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +// specifically for triangles counting +static __device__ __forceinline__ uint64_t shfl_down(uint64_t r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down_sync(mask, a.x, offset, bound); + a.y = __shfl_down_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_down(mask, a.x, offset, bound); + a.y = __shfl_down(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ int shfl_up(int r, int offset, int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_up_sync(mask, r, offset, bound); +#else + return __shfl_up(r, offset, bound); +#endif +#else + return 0.0f; +#endif +} + +static 
__device__ __forceinline__ float shfl_up(float r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_up_sync(mask, r, offset, bound); +#else + return __shfl_up(r, offset, bound); +#endif +#else + return 0.0f; +#endif +} + +static __device__ __forceinline__ double shfl_up(double r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up_sync(mask, a.x, offset, bound); + a.y = __shfl_up_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up(a.x, offset, bound); + a.y = __shfl_up(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ long long shfl_up(long long r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up_sync(mask, a.x, offset, bound); + a.y = __shfl_up_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up(a.x, offset, bound); + a.y = __shfl_up(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __inline__ __device__ double atomicFPAdd(double *addr, double val) { +// atomicAdd for double starts with sm_60 +#if __CUDA_ARCH__ >= 600 + return atomicAdd(addr, val); +#else + unsigned long long old = __double_as_longlong(addr[0]), assumed; + + do { + assumed = old; + old = atomicCAS((unsigned long long *)addr, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + + return old; +#endif +} + +// atomicAdd for float starts with sm_20 +static __inline__ __device__ float atomicFPAdd(float *addr, float val) { + return atomicAdd(addr, val); +} + +static __inline__ __device__ double atomicFPMin(double *addr, double val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(min(val, assumed)))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPMin(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int(min(val, assumed)))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPMax(double *addr, double val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(max(val, assumed)))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPMax(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int(max(val, assumed)))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPOr(double *addr, double 
val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong((bool)val | (bool)assumed))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPOr(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int((bool)val | (bool)assumed))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPLog(double *addr, double val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(-log(exp(-val) + exp(-assumed))))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPLog(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = + int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int(-logf(expf(-val) + expf(-assumed))))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +// Apply diagonal matrix to vector: +// +template +static __global__ void diagmv(IndexType_ n, ValueType_ alpha, + const ValueType_ *__restrict__ D, + const ValueType_ *__restrict__ x, + ValueType_ *__restrict__ y) { + IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + y[i] += alpha * D[i] * x[i]; + i += blockDim.x * gridDim.x; + } +} + +} // namespace utils + +} // namespace raft diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu new file mode 100644 index 0000000000..4ff6cdf5fa --- /dev/null +++ b/cpp/test/cluster_solvers.cu @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +namespace raft { + +TEST(Raft, ClusterSolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + + index_type maxiter{100}; + value_type tol{1.0e-10}; + unsigned long long seed{100110021003}; + + auto stream = h.get_stream(); + + index_type n{100}; + index_type d{10}; + index_type k{5}; + + //nullptr expected to trigger exceptions: + // + value_type* eigvecs{nullptr}; + index_type* codes{nullptr}; + + cluster_solver_config_t cfg{k, maxiter, tol, seed}; + + kmeans_solver_t cluster_solver{cfg}; + + EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, + eigvecs, codes)); +} + +TEST(Raft, ModularitySolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + //nullptr expected to trigger exceptions: + // + index_type* clusters{nullptr}; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + + unsigned long long seed{100110021003}; + + eigen_solver_config_t eig_cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + lanczos_solver_t eig_solver{eig_cfg}; + + index_type k{5}; + + cluster_solver_config_t clust_cfg{k, maxiter, tol, + seed}; + kmeans_solver_t cluster_solver{clust_cfg}; + + auto stream = h.get_stream(); + sparse_matrix_t sm{h, nullptr, nullptr, + nullptr, 0, 0}; + auto t_exe_p = thrust::cuda::par.on(stream); + + EXPECT_ANY_THROW(spectral::modularity_maximization( + h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + + value_type modularity{0}; + EXPECT_ANY_THROW( + spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); +} + +} // namespace raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu new file mode 100644 index 0000000000..e6ee09262e --- /dev/null +++ b/cpp/test/eigen_solvers.cu @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +namespace raft { + +TEST(Raft, EigenSolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + index_type* ro{nullptr}; + index_type* ci{nullptr}; + value_type* vs{nullptr}; + index_type nnz = 0; + index_type nrows = 0; + auto stream = h.get_stream(); + auto t_exe_pol = thrust::cuda::par.on(stream); + + sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; + ASSERT_EQ(nullptr, sm1.row_offsets_); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + //nullptr expected to trigger exceptions: + // + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + unsigned long long seed{100110021003}; + + eigen_solver_config_t cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + + lanczos_solver_t eig_solver{cfg}; + + EXPECT_ANY_THROW( + eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); + + EXPECT_ANY_THROW( + eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); +} + +TEST(Raft, SpectralSolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + //nullptr expected to trigger exceptions: + // + index_type* clusters{nullptr}; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + + unsigned long long seed{100110021003}; + + eigen_solver_config_t eig_cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + lanczos_solver_t eig_solver{eig_cfg}; + + index_type k{5}; + + cluster_solver_config_t clust_cfg{k, maxiter, tol, + seed}; + kmeans_solver_t cluster_solver{clust_cfg}; + + auto stream = h.get_stream(); + + auto t_exe_p = thrust::cuda::par.on(stream); + sparse_matrix_t sm{h, nullptr, nullptr, + nullptr, 0, 0}; + EXPECT_ANY_THROW(spectral::partition( + h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + + value_type edgeCut{0}; + value_type cost{0}; + EXPECT_ANY_THROW( + spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); +} + +} // namespace raft diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu new file mode 100644 index 0000000000..e5c2d52764 --- /dev/null +++ b/cpp/test/spectral_matrix.cu @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +namespace raft { +namespace { +template +struct csr_view_t { + index_type* offsets; + index_type* indices; + value_type* edge_data; + index_type number_of_vertices; + index_type number_of_edges; +}; +} // namespace +TEST(Raft, SpectralMatrices) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + csr_view_t csr_v{nullptr, nullptr, nullptr, 0, 0}; + + int const sz = 10; + vector_t d_v{h, sz}; + + index_type* ro{nullptr}; + index_type* ci{nullptr}; + value_type* vs{nullptr}; + index_type nnz = 0; + index_type nrows = 0; + sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; + sparse_matrix_t sm2{h, csr_v}; + ASSERT_EQ(nullptr, sm1.row_offsets_); + ASSERT_EQ(nullptr, sm2.row_offsets_); + + auto stream = h.get_stream(); + auto t_exe_pol = thrust::cuda::par.on(stream); + + auto cnstr_lm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { + laplacian_matrix_t lm1{h, t_exe_pol, ro, ci, + vs, nrows, nnz}; + }; + EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args + + auto cnstr_lm2 = [&h, t_exe_pol, &sm2](void) { + laplacian_matrix_t lm2{h, t_exe_pol, sm2}; + }; + EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args + + auto cnstr_mm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { + modularity_matrix_t mm1{h, t_exe_pol, ro, ci, + vs, nrows, nnz}; + }; + EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args + + auto cnstr_mm2 = [&h, t_exe_pol, &sm2](void) { + modularity_matrix_t mm2{h, t_exe_pol, sm2}; + }; + EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args +} + +} // namespace raft
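The new tests above only exercise the error paths (null buffers are expected to throw), so the sketches that follow show how the additions are meant to be driven. None of them are part of the patch; the `<raft/handle.hpp>` include, the explicit `<int, double>` template arguments (the patch as rendered here drops the angle-bracket contents), and the toy triangle graph are assumptions on my part.

First, the basic building blocks from `matrix_wrappers.hpp`: a handle-allocated `vector_t` and a non-owning `sparse_matrix_t` wrapped around an existing device CSR, driven through `mv()`.

```cpp
// Usage sketch only: wrap a device CSR matrix and run y = A*x via mv().
#include <cuda_runtime.h>
#include <thrust/system/cuda/execution_policy.h>

#include <raft/handle.hpp>  // assumed location of handle_t
#include <raft/spectral/matrix_wrappers.hpp>

#include <vector>

void spmv_sketch() {
  raft::handle_t handle;
  auto stream = handle.get_stream();
  auto t_exe_pol = thrust::cuda::par.on(stream);

  // Toy undirected triangle graph in CSR form (host side), unit weights.
  std::vector<int> h_offsets{0, 2, 4, 6};
  std::vector<int> h_indices{1, 2, 0, 2, 0, 1};
  std::vector<double> h_values(6, 1.0);
  int nrows = 3, nnz = 6;

  // The wrapper is non-owning and takes raw device pointers.
  int *d_offsets, *d_indices;
  double *d_values;
  cudaMalloc(&d_offsets, sizeof(int) * h_offsets.size());
  cudaMalloc(&d_indices, sizeof(int) * nnz);
  cudaMalloc(&d_values, sizeof(double) * nnz);
  cudaMemcpyAsync(d_offsets, h_offsets.data(), sizeof(int) * h_offsets.size(),
                  cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_indices, h_indices.data(), sizeof(int) * nnz,
                  cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_values, h_values.data(), sizeof(double) * nnz,
                  cudaMemcpyHostToDevice, stream);

  raft::matrix::sparse_matrix_t<int, double> A{handle, d_offsets, d_indices,
                                               d_values, nrows, nnz};

  // x = ones, y = zeros; both buffers come from the handle's device allocator
  // and are released by ~vector_t().
  raft::matrix::vector_t<double> x{handle, nrows};
  raft::matrix::vector_t<double> y{handle, nrows};
  x.fill(t_exe_pol, 1.0);
  y.fill(t_exe_pol, 0.0);

  // y = 1*A*x + 0*y; for this graph y ends up holding the vertex degrees.
  A.mv(1.0, x.raw(), 0.0, y.raw());

  cudaStreamSynchronize(stream);
  cudaFree(d_values);
  cudaFree(d_indices);
  cudaFree(d_offsets);
}
```

Since the wrappers launch kernels, a sketch like this has to live in a `.cu` translation unit compiled with nvcc.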
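`laplacian_matrix_t` and `modularity_matrix_t` never materialize their operators. The Laplacian constructor computes the degree vector d = A·1 with one SpMV into `diagonal_`, and its `mv()` evaluates y = alpha·(D − A)·x + beta·y as a `utils::diagmv` launch plus an SpMV with −alpha. The modularity wrapper caches `edge_sum_` = Σd = 2|E| and, for alpha = 1 (the value the solvers use), its `mv()` applies B·x = A·x − (dᵀx / 2|E|)·d with one SpMV, one dot product and one axpy; note that this override overwrites y, so `beta` is effectively ignored. A cheap sanity check, under the same assumptions as the previous sketch, is that both operators annihilate the ones-vector:

```cpp
// L*1 = D*1 - A*1 = d - d = 0 and B*1 = A*1 - (d'1 / 2|E|)*d = d - d = 0.
#include <thrust/system/cuda/execution_policy.h>

#include <raft/handle.hpp>  // assumed location of handle_t
#include <raft/spectral/matrix_wrappers.hpp>

void graph_operator_sketch(raft::handle_t const& handle, int const* d_offsets,
                           int const* d_indices, double const* d_values,
                           int nrows, int nnz) {
  auto t_exe_pol = thrust::cuda::par.on(handle.get_stream());

  raft::matrix::laplacian_matrix_t<int, double> L{
    handle, t_exe_pol, d_offsets, d_indices, d_values, nrows, nnz};
  raft::matrix::modularity_matrix_t<int, double> B{
    handle, t_exe_pol, d_offsets, d_indices, d_values, nrows, nnz};

  raft::matrix::vector_t<double> ones{handle, nrows};
  raft::matrix::vector_t<double> out{handle, nrows};
  ones.fill(t_exe_pol, 1.0);

  L.mv(1.0, ones.raw(), 0.0, out.raw());  // expect out == 0
  B.mv(1.0, ones.raw(), 0.0, out.raw());  // expect out == 0 again

  double degree_sum = L.diagonal_.nrm1(t_exe_pol);  // 2|E| for unit weights
  (void)degree_sum;
}
```

`nrm1` relies on a `__device__` lambda, so the translation unit needs nvcc's extended-lambda support, which the library's own use of `nrm1` already presupposes.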
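At the solver level, `partition()` takes the CSR wrapper plus an eigensolver and a cluster solver and returns a `std::tuple` of (eigensolver iterations, k-means residual, k-means iterations): it builds the Laplacian, computes the smallest eigenpairs with the supplied eigensolver (Lanczos in the tests), whitens the eigenvector matrix via `transform_eigen_matrix` (mean-center and rescale each column, then transpose with `cublasgeam`), and clusters the embedded points. The sketch below mirrors the new tests but with real output buffers; the solver header names and the `<index_type, value_type>` template arguments are assumptions (the rendered patch drops both), while the brace-initializer field orders are taken from the tests.

```cpp
#include <thrust/system/cuda/execution_policy.h>

#include <raft/handle.hpp>                    // assumed location of handle_t
#include <raft/spectral/cluster_solvers.hpp>  // assumed: kmeans_solver_t
#include <raft/spectral/eigen_solvers.hpp>    // assumed: lanczos_solver_t
#include <raft/spectral/matrix_wrappers.hpp>
#include <raft/spectral/partition.hpp>

namespace raft {  // unqualified names resolve exactly as in the new tests

void partition_sketch(handle_t const& h, int const* row_offsets,
                      int const* col_indices, double const* values, int nrows,
                      int nnz) {
  using namespace matrix;
  using index_type = int;
  using value_type = double;

  auto stream = h.get_stream();
  auto t_exe_p = thrust::cuda::par.on(stream);

  sparse_matrix_t<index_type, value_type> csr{h, row_offsets, col_indices,
                                              values, nrows, nnz};

  // Illustrative solver parameters; field order follows the tests.
  index_type neigvs{4}, maxiter{100}, restart_iter{20}, k{4};
  value_type tol{1.0e-8};
  bool reorthog{true};
  unsigned long long seed{1234};

  eigen_solver_config_t<index_type, value_type> eig_cfg{
    neigvs, maxiter, restart_iter, tol, reorthog, seed};
  lanczos_solver_t<index_type, value_type> eig_solver{eig_cfg};

  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol,
                                                            seed};
  kmeans_solver_t<index_type, value_type> cluster_solver{clust_cfg};

  // Outputs: n cluster labels, neigvs eigenvalues, n x neigvs eigenvectors.
  vector_t<index_type> clusters{h, nrows};
  vector_t<value_type> eigvals{h, neigvs};
  vector_t<value_type> eigvecs{h, nrows * neigvs};

  // Returns {eigensolver iterations, k-means residual, k-means iterations}.
  auto stats =
    spectral::partition(h, t_exe_p, csr, eig_solver, cluster_solver,
                        clusters.raw(), eigvals.raw(), eigvecs.raw());

  value_type edge_cut{0}, cost{0};
  spectral::analyzePartition(h, t_exe_p, csr, k, clusters.raw(), edge_cut,
                             cost);
  (void)stats;
}

}  // namespace raft
```

`modularity_maximization()` follows the same call shape, substituting the modularity matrix, the largest eigenpairs, and an extra `scale_obs` pass before clustering.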
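Both `analyzePartition` and `analyzeModularity` reduce to a loop over clusters driven by `construct_indicator`: for cluster i it fills `part_i` with a 0/1 indicator through the `equal_to_i_op` functor, reports the cluster size as part_iᵀ·part_i, and reports part_iᵀ·(B·part_i) as the per-cluster statistic. With the Laplacian that quadratic form is the weight of edges leaving cluster i; summed over all clusters every cut edge is counted twice (once from each side), which is why `analyzePartition` accumulates `partEdgesCut / 2` into the edge cut. A standalone version of the same loop for the ratio-cut cost, again only a sketch:

```cpp
// Hypothetical helper reproducing analyzePartition's cost accumulation.
#include <thrust/system/cuda/execution_policy.h>

#include <raft/spectral/matrix_wrappers.hpp>
#include <raft/spectral/spectral_util.hpp>

namespace raft {  // unqualified names resolve as they do inside the library

double ratio_cut_sketch(handle_t const& h,
                        matrix::laplacian_matrix_t<int, double> const& L,
                        int const* d_clusters, int n, int n_clusters) {
  auto stream = h.get_stream();
  auto t_exe_pol = thrust::cuda::par.on(stream);

  // Scalar results of cublasdot come back through host pointers, so keep the
  // cuBLAS pointer mode at HOST, as the library routines do.
  linalg::cublassetpointermode(h.get_cublas_handle(), CUBLAS_POINTER_MODE_HOST,
                               stream);

  matrix::vector_t<double> part_i{h, n};
  matrix::vector_t<double> Lx{h, n};

  double cost{0};
  for (int i = 0; i < n_clusters; ++i) {
    double cluster_size{0}, part_cut{0};
    if (!spectral::construct_indicator(h, t_exe_pol, i, n, cluster_size,
                                       part_cut, d_clusters, part_i, Lx, L)) {
      continue;  // empty cluster, mirroring the WARNING path
    }
    cost += part_cut / cluster_size;
  }
  return cost;
}

}  // namespace raft
```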
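`scale_obs` normalizes each column of the (already transposed) eigenvector matrix by its 2-norm before k-means. The launch helper rounds m up with `next_pow2` and clamps the block's x-dimension to [2, 32] (m = 10, say, gives a 16×16 thread block), and the kernel accumulates squared entries with a sub-warp inclusive prefix sum built from `utils::shfl_up`, broadcasting each slice's running total with `utils::shfl`. The same scan pattern in isolation, assuming a single full warp:

```cpp
// Warp-level inclusive prefix sum, launched as <<<1, 32>>>:
//   out[i] = in[0] + ... + in[i]
// This is the k-loop scale_obs_kernel runs on each blockDim.x-sized slice.
#include <raft/utils/sm_utils.hpp>

__global__ void warp_inclusive_scan(const float* in, float* out) {
  float v = in[threadIdx.x];
  for (int k = 1; k < 32; k *= 2) {
    float up = raft::utils::shfl_up(v, k, 32);
    if (threadIdx.x >= k) v += up;  // lanes below k keep their own value
  }
  out[threadIdx.x] = v;
}
```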
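`utils::diagmv` is the kernel `laplacian_matrix_t::mv` uses to apply the degree matrix; on its own it performs the fused update y += alpha·D∘x over a grid-strided range. A hypothetical wrapper with the same launch shape as the library (1024-thread blocks, grid clamped to 65535):

```cpp
#include <algorithm>

#include <raft/utils/sm_utils.hpp>

void apply_diagonal(cudaStream_t stream, int n, double alpha,
                    const double* d_diag, const double* d_x, double* d_y) {
  constexpr int block = 1024;
  int grid = std::min((n + block - 1) / block, 65535);
  // y[i] += alpha * d_diag[i] * d_x[i] for i in [0, n)
  raft::utils::diagmv<<<grid, block, 0, stream>>>(n, alpha, d_diag, d_x, d_y);
}
```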
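Finally, the `atomicFP*` helpers paper over architecture differences: `atomicFPAdd(double*, double)` uses the native double `atomicAdd` from sm_60 onward and falls back to the compare-and-swap loop below that, and the min/max/or/log variants are built on the same CAS pattern. A small grid-strided reduction into a single double accumulator:

```cpp
#include <raft/utils/sm_utils.hpp>

// result must be zero-initialized (e.g. via cudaMemset) before the launch.
__global__ void accumulate_sum(const double* x, int n, double* result) {
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n;
       i += blockDim.x * gridDim.x) {
    raft::utils::atomicFPAdd(result, x[i]);  // CAS loop below sm_60
  }
}
```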