From 5dfe8d5ab2d8db51cf81b1f681cc35aeed9f93a1 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 1 Feb 2022 16:09:49 -0500 Subject: [PATCH 1/8] Hiding impl details for lap, spectral, and label --- cpp/include/raft/cluster/detail/kmeans.hpp | 986 ++++++++++++++++++ cpp/include/raft/cluster/kmeans.hpp | 66 ++ cpp/include/raft/label/classlabels.hpp | 115 ++ .../raft/label/{ => detail}/classlabels.cuh | 3 + .../raft/label/{ => detail}/merge_labels.cuh | 0 cpp/include/raft/label/merge_labels.hpp | 68 ++ cpp/include/raft/lap/{ => detail}/d_structs.h | 0 .../raft/lap/{ => detail}/lap_functions.cuh | 0 .../raft/lap/{ => detail}/lap_kernels.cuh | 0 cpp/include/raft/lap/{lap.cuh => lap.hpp} | 4 +- .../raft/sparse/linalg/detail/spectral.cuh | 1 - cpp/include/raft/spectral/cluster_solvers.hpp | 8 +- .../raft/spectral/{ => detail}/lapack.hpp | 0 .../spectral/{ => detail}/matrix_wrappers.hpp | 0 .../detail/modularity_maximization.hpp | 188 ++++ .../raft/spectral/detail/partition.hpp | 180 ++++ .../spectral/{ => detail}/spectral_util.hpp | 0 .../raft/spectral/{ => detail}/warn_dbg.hpp | 0 cpp/include/raft/spectral/eigen_solvers.hpp | 3 + cpp/include/raft/spectral/kmeans.hpp | 986 ------------------ .../raft/spectral/modularity_maximization.hpp | 127 +-- cpp/include/raft/spectral/partition.hpp | 101 +- 22 files changed, 1637 insertions(+), 1199 deletions(-) create mode 100644 cpp/include/raft/cluster/detail/kmeans.hpp create mode 100644 cpp/include/raft/cluster/kmeans.hpp create mode 100644 cpp/include/raft/label/classlabels.hpp rename cpp/include/raft/label/{ => detail}/classlabels.cuh (99%) rename cpp/include/raft/label/{ => detail}/merge_labels.cuh (100%) create mode 100644 cpp/include/raft/label/merge_labels.hpp rename cpp/include/raft/lap/{ => detail}/d_structs.h (100%) rename cpp/include/raft/lap/{ => detail}/lap_functions.cuh (100%) rename cpp/include/raft/lap/{ => detail}/lap_kernels.cuh (100%) rename cpp/include/raft/lap/{lap.cuh => lap.hpp} (99%) rename cpp/include/raft/spectral/{ => detail}/lapack.hpp (100%) rename cpp/include/raft/spectral/{ => detail}/matrix_wrappers.hpp (100%) create mode 100644 cpp/include/raft/spectral/detail/modularity_maximization.hpp create mode 100644 cpp/include/raft/spectral/detail/partition.hpp rename cpp/include/raft/spectral/{ => detail}/spectral_util.hpp (100%) rename cpp/include/raft/spectral/{ => detail}/warn_dbg.hpp (100%) delete mode 100644 cpp/include/raft/spectral/kmeans.hpp diff --git a/cpp/include/raft/cluster/detail/kmeans.hpp b/cpp/include/raft/cluster/detail/kmeans.hpp new file mode 100644 index 0000000000..cf7255c22e --- /dev/null +++ b/cpp/include/raft/cluster/detail/kmeans.hpp @@ -0,0 +1,986 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace cluster { +namespace detail { +// ========================================================= +// Useful grid settings +// ========================================================= + + constexpr unsigned int BLOCK_SIZE = 1024; + constexpr unsigned int WARP_SIZE = 32; + constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); + +// ========================================================= +// CUDA kernels +// ========================================================= + +/** + * @brief Compute distances between observation vectors and centroids + * Block dimensions should be (warpSize, 1, + * blockSize/warpSize). Ideally, the grid is large enough so there + * are d threads in the x-direction, k threads in the y-direction, + * and n threads in the z-direction. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, d*n entries) Observation matrix. Matrix is + * stored column-major and each column is an observation + * vector. Matrix dimensions are d x n. + * @param centroids (Input, d*k entries) Centroid matrix. Matrix is + * stored column-major and each column is a centroid. Matrix + * dimensions are d x k. + * @param dists (Output, n*k entries) Distance matrix. Matrix is + * stored column-major and the (i,j)-entry is the square of the + * Euclidean distance between the ith observation vector and jth + * centroid. Matrix dimensions are n x k. Entries must be + * initialized to zero. + */ + template + static __global__ void computeDistances(index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) + { + // Loop index + index_type_t i; + + // Block indices + index_type_t bidx; + // Global indices + index_type_t gidx, gidy, gidz; + + // Private memory + value_type_t centroid_private, dist_private; + + // Global x-index indicates index of vector entry + bidx = blockIdx.x; + while (bidx * blockDim.x < d) { + gidx = threadIdx.x + bidx * blockDim.x; + + // Global y-index indicates centroid + gidy = threadIdx.y + blockIdx.y * blockDim.y; + while (gidy < k) { + // Load centroid coordinate from global memory + centroid_private = (gidx < d) ? centroids[IDX(gidx, gidy, d)] : 0; + + // Global z-index indicates observation vector + gidz = threadIdx.z + blockIdx.z * blockDim.z; + while (gidz < n) { + // Load observation vector coordinate from global memory + dist_private = (gidx < d) ? 
obs[IDX(gidx, gidz, d)] : 0; + + // Compute contribution of current entry to distance + dist_private = centroid_private - dist_private; + dist_private = dist_private * dist_private; + + // Perform reduction on warp + for (i = WARP_SIZE / 2; i > 0; i /= 2) + dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); + + // Write result to global memory + if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private); + + // Move to another observation vector + gidz += blockDim.z * gridDim.z; + } + + // Move to another centroid + gidy += blockDim.y * gridDim.y; + } + + // Move to another vector entry + bidx += gridDim.x; + } + } + +/** + * @brief Find closest centroid to observation vectors. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Number of observation vectors. + * @param k Number of clusters. + * @param centroids (Input, d*k entries) Centroid matrix. Matrix is + * stored column-major and each column is a centroid. Matrix + * dimensions are d x k. + * @param dists (Input/output, n*k entries) Distance matrix. Matrix + * is stored column-major and the (i,j)-entry is the square of + * the Euclidean distance between the ith observation vector and + * jth centroid. Matrix dimensions are n x k. On exit, the first + * n entries give the square of the Euclidean distance between + * observation vectors and closest centroids. + * @param codes (Output, n entries) Cluster assignments. + * @param clusterSizes (Output, k entries) Number of points in each + * cluster. Entries must be initialized to zero. + */ + template + static __global__ void minDistances(index_type_t n, + index_type_t k, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) + { + // Loop index + index_type_t i, j; + + // Current matrix entry + value_type_t dist_curr; + + // Smallest entry in row + value_type_t dist_min; + index_type_t code_min; + + // Each row in observation matrix is processed by a thread + i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + // Find minimum entry in row + code_min = 0; + dist_min = dists[IDX(i, 0, n)]; + for (j = 1; j < k; ++j) { + dist_curr = dists[IDX(i, j, n)]; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; + } + + // Transfer result to global memory + dists[i] = dist_min; + codes[i] = code_min; + + // Increment cluster sizes + atomicAdd(clusterSizes + code_min, 1); + + // Move to another row + i += blockDim.x * gridDim.x; + } + } + +/** + * @brief Check if newly computed distances are smaller than old distances. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param n Number of observation vectors. + * @param dists_old (Input/output, n entries) Distances between + * observation vectors and closest centroids. On exit, entries + * are replaced by entries in 'dists_new' if the corresponding + * observation vectors are closest to the new centroid. + * @param dists_new (Input, n entries) Distance between observation + * vectors and new centroid. + * @param codes_old (Input/output, n entries) Cluster + * assignments. 
On exit, entries are replaced with 'code_new' if + * the corresponding observation vectors are closest to the new + * centroid. + * @param code_new Index associated with new centroid. + */ + template + static __global__ void minDistances2(index_type_t n, + value_type_t* __restrict__ dists_old, + const value_type_t* __restrict__ dists_new, + index_type_t* __restrict__ codes_old, + index_type_t code_new) + { + // Loop index + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; + + // Distances + value_type_t dist_old_private; + value_type_t dist_new_private; + + // Each row is processed by a thread + while (i < n) { + // Get old and new distances + dist_old_private = dists_old[i]; + dist_new_private = dists_new[i]; + + // Update if new distance is smaller than old distance + if (dist_new_private < dist_old_private) { + dists_old[i] = dist_new_private; + codes_old[i] = code_new; + } + + // Move to another row + i += blockDim.x * gridDim.x; + } + } + +/** + * @brief Compute size of k-means clusters. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam index_type_t the type of data used for indexing. + * @param n Number of observation vectors. + * @param k Number of clusters. + * @param codes (Input, n entries) Cluster assignments. + * @param clusterSizes (Output, k entries) Number of points in each + * cluster. Entries must be initialized to zero. + */ + template + static __global__ void computeClusterSizes(index_type_t n, + const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) + { + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + atomicAdd(clusterSizes + codes[i], 1); + i += blockDim.x * gridDim.x; + } + } + +/** + * @brief Divide rows of centroid matrix by cluster sizes. + * Divides the ith column of the sum matrix by the size of the ith + * cluster. If the sum matrix has been initialized so that the ith + * row is the sum of all observation vectors in the ith cluster, + * this kernel produces cluster centroids. The grid and block + * dimensions should be 2-dimensional. Ideally the grid is large + * enough so there are d threads in the x-direction and k threads + * in the y-direction. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param clusterSizes (Input, k entries) Number of points in each + * cluster. + * @param centroids (Input/output, d*k entries) Sum matrix. Matrix + * is stored column-major and matrix dimensions are d x k. The + * ith column is the sum of all observation vectors in the ith + * cluster. On exit, the matrix is the centroid matrix (each + * column is the mean position of a cluster). 
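+ *
+ * For example, with d = 2 and k = 2 (column-major): if the sum-matrix
+ * columns are {4, 6} and {9, 3} and clusterSizes = {2, 3}, the resulting
+ * centroid columns are {2, 3} and {3, 1}.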
+ */
+template <typename index_type_t, typename value_type_t>
+static __global__ void divideCentroids(index_type_t d,
+                                       index_type_t k,
+                                       const index_type_t* __restrict__ clusterSizes,
+                                       value_type_t* __restrict__ centroids)
+{
+  // Global indices
+  index_type_t gidx, gidy;
+
+  // Current cluster size
+  index_type_t clusterSize_private;
+
+  // Observation vector is determined by global y-index
+  gidy = threadIdx.y + blockIdx.y * blockDim.y;
+  while (gidy < k) {
+    // Get cluster size from global memory
+    clusterSize_private = clusterSizes[gidy];
+
+    // Add vector entries to centroid matrix
+    // vector entries are determined by global x-index
+    gidx = threadIdx.x + blockIdx.x * blockDim.x;
+    while (gidx < d) {
+      centroids[IDX(gidx, gidy, d)] /= clusterSize_private;
+      gidx += blockDim.x * gridDim.x;
+    }
+
+    // Move to another centroid
+    gidy += blockDim.y * gridDim.y;
+  }
+}
+
+// =========================================================
+// Helper functions
+// =========================================================
+
+/**
+ * @brief Randomly choose a new centroid.
+ *   The centroid is chosen with the k-means++ algorithm.
+ * @tparam index_type_t the type of data used for indexing.
+ * @tparam value_type_t the type of data used for weights, distances.
+ * @param handle the raft handle.
+ * @param n Number of observation vectors.
+ * @param d Dimension of observation vectors.
+ * @param rand Random number drawn uniformly from [0,1).
+ * @param obs (Input, device memory, d*n entries) Observation
+ *   matrix. Matrix is stored column-major and each column is an
+ *   observation vector. Matrix dimensions are d x n.
+ * @param dists (Input, device memory, 2*n entries) Workspace. The
+ *   first n entries should be the distance between observation
+ *   vectors and the closest centroid.
+ * @param centroid (Output, device memory, d entries) Centroid
+ *   coordinates.
+ * @return Zero if successful. Otherwise non-zero.
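+ *
+ * Selection follows the k-means++ rule: observation i is drawn with
+ * probability proportional to dists[i] (the squared distance to its
+ * closest existing centroid), so the routine assumes 'dists' holds
+ * those squared distances on entry.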
+ */
+template <typename index_type_t, typename value_type_t>
+static int chooseNewCentroid(handle_t const& handle,
+                             index_type_t n,
+                             index_type_t d,
+                             value_type_t rand,
+                             const value_type_t* __restrict__ obs,
+                             value_type_t* __restrict__ dists,
+                             value_type_t* __restrict__ centroid)
+{
+  // Cumulative sum of distances
+  value_type_t* distsCumSum = dists + n;
+  // Residual sum of squares
+  value_type_t distsSum{0};
+  // Observation vector that is chosen as new centroid
+  index_type_t obsIndex;
+
+  auto stream             = handle.get_stream();
+  auto thrust_exec_policy = handle.get_thrust_policy();
+
+  // Compute cumulative sum of distances
+  thrust::inclusive_scan(thrust_exec_policy,
+                         thrust::device_pointer_cast(dists),
+                         thrust::device_pointer_cast(dists + n),
+                         thrust::device_pointer_cast(distsCumSum));
+  RAFT_CHECK_CUDA(stream);
+  RAFT_CUDA_TRY(cudaMemcpyAsync(
+    &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream));
+
+  // Randomly choose observation vector
+  // Probabilities are proportional to square of distance to closest
+  // centroid (see k-means++ algorithm)
+  //
+  // seg-faults due to Thrust bug
+  // on binary-search-like algorithms
+  // when run with stream dependent
+  // execution policies; fixed on Thrust GitHub
+  // hence replace w/ linear interpolation,
+  // until the Thrust issue gets resolved:
+  //
+  // obsIndex = (thrust::lower_bound(
+  //              thrust_exec_policy, thrust::device_pointer_cast(distsCumSum),
+  //              thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) -
+  //             thrust::device_pointer_cast(distsCumSum));
+  //
+  // linear interpolation logic:
+  //{
+  value_type_t minSum{0};
+  RAFT_CUDA_TRY(
+    cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream));
+  RAFT_CHECK_CUDA(stream);
+
+  if (distsSum > minSum) {
+    value_type_t vIndex = static_cast<value_type_t>(n - 1);
+    obsIndex = static_cast<index_type_t>(vIndex * (distsSum * rand - minSum) / (distsSum - minSum));
+  } else {
+    obsIndex = 0;
+  }
+  //}
+
+  RAFT_CHECK_CUDA(stream);
+  obsIndex = max(obsIndex, 0);
+  obsIndex = min(obsIndex, n - 1);
+
+  // Record new centroid position
+  RAFT_CUDA_TRY(cudaMemcpyAsync(centroid,
+                                obs + IDX(0, obsIndex, d),
+                                d * sizeof(value_type_t),
+                                cudaMemcpyDeviceToDevice,
+                                stream));
+
+  return 0;
+}
+
+/**
+ * @brief Choose initial cluster centroids for k-means algorithm.
+ *   Centroids are randomly chosen with the k-means++ algorithm.
+ * @tparam index_type_t the type of data used for indexing.
+ * @tparam value_type_t the type of data used for weights, distances.
+ * @param handle the raft handle.
+ * @param n Number of observation vectors.
+ * @param d Dimension of observation vectors.
+ * @param k Number of clusters.
+ * @param obs (Input, device memory, d*n entries) Observation
+ *   matrix. Matrix is stored column-major and each column is an
+ *   observation vector. Matrix dimensions are d x n.
+ * @param centroids (Output, device memory, d*k entries) Centroid
+ *   matrix. Matrix is stored column-major and each column is a
+ *   centroid. Matrix dimensions are d x k.
+ * @param codes (Output, device memory, n entries) Cluster
+ *   assignments.
+ * @param clusterSizes (Output, device memory, k entries) Number of
+ *   points in each cluster.
+ * @param dists (Output, device memory, 2*n entries) Workspace. On
+ *   exit, the first n entries give the square of the Euclidean
+ *   distance between observation vectors and the closest centroid.
+ * @param seed Random seed used for the k-means++ draws.
+ * @return Zero if successful. Otherwise non-zero.
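+ *
+ * The initialization below follows the k-means++ recipe: the first
+ * centroid is drawn uniformly (all distances are seeded to 1), and each
+ * subsequent centroid i is drawn with probability proportional to the
+ * current squared distances, after which per-point minimum distances and
+ * codes are refreshed via minDistances2.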
+ */
+template <typename index_type_t, typename value_type_t>
+static int initializeCentroids(handle_t const& handle,
+                               index_type_t n,
+                               index_type_t d,
+                               index_type_t k,
+                               const value_type_t* __restrict__ obs,
+                               value_type_t* __restrict__ centroids,
+                               index_type_t* __restrict__ codes,
+                               index_type_t* __restrict__ clusterSizes,
+                               value_type_t* __restrict__ dists,
+                               unsigned long long seed)
+{
+  // -------------------------------------------------------
+  // Variable declarations
+  // -------------------------------------------------------
+
+  // Loop index
+  index_type_t i;
+
+  // Random number generator
+  thrust::default_random_engine rng(seed);
+  thrust::uniform_real_distribution<value_type_t> uniformDist(0, 1);
+
+  auto stream             = handle.get_stream();
+  auto thrust_exec_policy = handle.get_thrust_policy();
+
+  constexpr index_type_t grid_lower_bound{65535};
+
+  // -------------------------------------------------------
+  // Implementation
+  // -------------------------------------------------------
+
+  // Initialize block dimensions
+  dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE};
+
+  // CUDA grid dimensions
+  dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+                    1,
+                    min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)};
+
+  // CUDA grid dimensions
+  dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1};
+
+  // Assign observation vectors to code 0
+  RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream));
+
+  // Choose first centroid
+  thrust::fill(thrust_exec_policy,
+               thrust::device_pointer_cast(dists),
+               thrust::device_pointer_cast(dists + n),
+               1);
+  RAFT_CHECK_CUDA(stream);
+  if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids))
+    WARNING("error in k-means++ (could not pick centroid)");
+
+  // Compute distances from first centroid
+  RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream));
+  computeDistances<<<gridDim_warp, blockDim_warp, 0, stream>>>(n, d, 1, obs, centroids, dists);
+  RAFT_CHECK_CUDA(stream);
+
+  // Choose remaining centroids
+  for (i = 1; i < k; ++i) {
+    // Choose ith centroid
+    if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids + IDX(0, i, d)))
+      WARNING("error in k-means++ (could not pick centroid)");
+
+    // Compute distances from ith centroid
+    RAFT_CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream));
+    computeDistances<<<gridDim_warp, blockDim_warp, 0, stream>>>(
+      n, d, 1, obs, centroids + IDX(0, i, d), dists + n);
+    RAFT_CHECK_CUDA(stream);
+
+    // Recompute minimum distances
+    minDistances2<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, dists, dists + n, codes, i);
+    RAFT_CHECK_CUDA(stream);
+  }
+
+  // Compute cluster sizes
+  RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream));
+  computeClusterSizes<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, codes, clusterSizes);
+  RAFT_CHECK_CUDA(stream);
+
+  return 0;
+}
+
+/**
+ * @brief Find cluster centroids closest to observation vectors.
+ *   Distance is measured with Euclidean norm.
+ * @tparam index_type_t the type of data used for indexing.
+ * @tparam value_type_t the type of data used for weights, distances.
+ * @param handle the raft handle.
+ * @param n Number of observation vectors.
+ * @param d Dimension of observation vectors.
+ * @param k Number of clusters.
+ * @param obs (Input, device memory, d*n entries) Observation
+ *   matrix. Matrix is stored column-major and each column is an
+ *   observation vector. Matrix dimensions are d x n.
+ * @param centroids (Input, device memory, d*k entries) Centroid
+ *   matrix. Matrix is stored column-major and each column is a
+ *   centroid. Matrix dimensions are d x k.
+ * @param dists (Output, device memory, n*k entries) Workspace. On + * exit, the first n entries give the square of the Euclidean + * distance between observation vectors and the closest centroid. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param residual_host (Output, host memory, 1 entry) Residual sum + * of squares of assignment. + * @return Zero if successful. Otherwise non-zero. + */ + template + static int assignCentroids(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) + { + auto stream = handle.get_stream(); + auto thrust_exec_policy = handle.get_thrust_policy(); + + // Compute distance between centroids and observation vectors + RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); + + // CUDA grid dimensions + dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; + + dim3 gridDim; + constexpr index_type_t grid_lower_bound{65535}; + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); + gridDim.y = min(k, grid_lower_bound); + gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); + + computeDistances<<>>(n, d, k, obs, centroids, dists); + RAFT_CHECK_CUDA(stream); + + // Find centroid closest to each observation vector + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, clusterSizes); + CHECK_CUDA(stream); + + // Compute residual sum of squares + *residual_host = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); + + return 0; + } + +/** + * @brief Update cluster centroids for k-means algorithm. + * All clusters are assumed to be non-empty. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Input, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Input, device memory, k entries) Number of + * points in each cluster. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param work (Output, device memory, n*d entries) Workspace. + * @param work_int (Output, device memory, 2*d*n entries) + * Workspace. + * @return Zero if successful. Otherwise non-zero. 
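+ *
+ * Conceptually, for each cluster c this computes
+ *   centroids[:, c] = (sum of obs[:, i] with codes[i] == c) / clusterSizes[c],
+ * realized below as a transpose, a stable sort by cluster code, a
+ * segmented reduction (thrust::reduce_by_key), and the divideCentroids
+ * kernel.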
+ */
+template <typename index_type_t, typename value_type_t>
+static int updateCentroids(handle_t const& handle,
+                           index_type_t n,
+                           index_type_t d,
+                           index_type_t k,
+                           const value_type_t* __restrict__ obs,
+                           const index_type_t* __restrict__ codes,
+                           const index_type_t* __restrict__ clusterSizes,
+                           value_type_t* __restrict__ centroids,
+                           value_type_t* __restrict__ work,
+                           index_type_t* __restrict__ work_int)
+{
+  // -------------------------------------------------------
+  // Variable declarations
+  // -------------------------------------------------------
+
+  // Useful constants
+  const value_type_t one  = 1;
+  const value_type_t zero = 0;
+
+  constexpr index_type_t grid_lower_bound{65535};
+
+  auto stream             = handle.get_stream();
+  auto cublas_h           = handle.get_cublas_handle();
+  auto thrust_exec_policy = handle.get_thrust_policy();
+
+  // Device memory
+  thrust::device_ptr<value_type_t> obs_copy(work);
+  thrust::device_ptr<index_type_t> codes_copy(work_int);
+  thrust::device_ptr<index_type_t> rows(work_int + d * n);
+
+  // Take transpose of observation matrix
+  RAFT_CUBLAS_TRY(linalg::cublasgeam(cublas_h,
+                                     CUBLAS_OP_T,
+                                     CUBLAS_OP_N,
+                                     n,
+                                     d,
+                                     &one,
+                                     obs,
+                                     d,
+                                     &zero,
+                                     (value_type_t*)NULL,
+                                     n,
+                                     thrust::raw_pointer_cast(obs_copy),
+                                     n,
+                                     stream));
+
+  // Cluster assigned to each observation matrix entry
+  thrust::sequence(thrust_exec_policy, rows, rows + d * n);
+  RAFT_CHECK_CUDA(stream);
+  thrust::transform(thrust_exec_policy,
+                    rows,
+                    rows + d * n,
+                    thrust::make_constant_iterator(n),
+                    rows,
+                    thrust::modulus<index_type_t>());
+  RAFT_CHECK_CUDA(stream);
+  thrust::gather(
+    thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy);
+  RAFT_CHECK_CUDA(stream);
+
+  // Row associated with each observation matrix entry
+  thrust::sequence(thrust_exec_policy, rows, rows + d * n);
+  RAFT_CHECK_CUDA(stream);
+  thrust::transform(thrust_exec_policy,
+                    rows,
+                    rows + d * n,
+                    thrust::make_constant_iterator(n),
+                    rows,
+                    thrust::divides<index_type_t>());
+  RAFT_CHECK_CUDA(stream);
+
+  // Sort and reduce to add observation vectors in same cluster
+  thrust::stable_sort_by_key(thrust_exec_policy,
+                             codes_copy,
+                             codes_copy + d * n,
+                             make_zip_iterator(make_tuple(obs_copy, rows)));
+  RAFT_CHECK_CUDA(stream);
+  thrust::reduce_by_key(thrust_exec_policy,
+                        rows,
+                        rows + d * n,
+                        obs_copy,
+                        codes_copy,  // Output to codes_copy is ignored
+                        thrust::device_pointer_cast(centroids));
+  RAFT_CHECK_CUDA(stream);
+
+  // Divide sums by cluster size to get centroid matrix
+  //
+  // CUDA block dimensions
+  dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1};
+
+  // CUDA grid dimensions
+  dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+               min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound),
+               1};
+
+  divideCentroids<<<gridDim, blockDim, 0, stream>>>(d, k, clusterSizes, centroids);
+  RAFT_CHECK_CUDA(stream);
+
+  return 0;
+}
+
+// =========================================================
+// k-means algorithm
+// =========================================================
+
+/**
+ * @brief Find clusters with k-means algorithm.
+ *   Initial centroids are chosen with k-means++ algorithm. Empty
+ *   clusters are reinitialized by choosing new centroids with
+ *   k-means++ algorithm.
+ * @tparam index_type_t the type of data used for indexing.
+ * @tparam value_type_t the type of data used for weights, distances.
+ * @param handle the raft handle.
+ * @param n Number of observation vectors.
+ * @param d Dimension of observation vectors.
+ * @param k Number of clusters.
+ * @param tol Tolerance for convergence.
+ *   k-means stops when the change in residual divided by n is less
+ *   than tol.
+ * @param maxiter Maximum number of k-means iterations.
+ * @param obs (Input, device memory, d*n entries) Observation
+ *   matrix. Matrix is stored column-major and each column is an
+ *   observation vector. Matrix dimensions are d x n.
+ * @param codes (Output, device memory, n entries) Cluster
+ *   assignments.
+ * @param clusterSizes (Output, device memory, k entries) Number of
+ *   points in each cluster.
+ * @param centroids (Output, device memory, d*k entries) Centroid
+ *   matrix. Matrix is stored column-major and each column is a
+ *   centroid. Matrix dimensions are d x k.
+ * @param work (Output, device memory, n*max(k,d) entries)
+ *   Workspace.
+ * @param work_int (Output, device memory, 2*d*n entries)
+ *   Workspace.
+ * @param residual_host (Output, host memory, 1 entry) Residual sum
+ *   of squares (sum of squares of distances between observation
+ *   vectors and centroids).
+ * @param iters_host (Output, host memory, 1 entry) Number of
+ *   k-means iterations.
+ * @param seed random seed to be used.
+ * @return error flag.
+ */
+template <typename index_type_t, typename value_type_t>
+int kmeans(handle_t const& handle,
+           index_type_t n,
+           index_type_t d,
+           index_type_t k,
+           value_type_t tol,
+           index_type_t maxiter,
+           const value_type_t* __restrict__ obs,
+           index_type_t* __restrict__ codes,
+           index_type_t* __restrict__ clusterSizes,
+           value_type_t* __restrict__ centroids,
+           value_type_t* __restrict__ work,
+           index_type_t* __restrict__ work_int,
+           value_type_t* residual_host,
+           index_type_t* iters_host,
+           unsigned long long seed)
+{
+  // -------------------------------------------------------
+  // Variable declarations
+  // -------------------------------------------------------
+
+  // Current iteration
+  index_type_t iter;
+
+  constexpr index_type_t grid_lower_bound{65535};
+
+  // Residual sum of squares at previous iteration
+  value_type_t residualPrev = 0;
+
+  // Random number generator
+  thrust::default_random_engine rng(seed);
+  thrust::uniform_real_distribution<value_type_t> uniformDist(0, 1);
+
+  // -------------------------------------------------------
+  // Initialization
+  // -------------------------------------------------------
+
+  auto stream             = handle.get_stream();
+  auto cublas_h           = handle.get_cublas_handle();
+  auto thrust_exec_policy = handle.get_thrust_policy();
+
+  // Trivial cases
+  if (k == 1) {
+    RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream));
+    RAFT_CUDA_TRY(
+      cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream));
+    if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int))
+      WARNING("could not compute k-means centroids");
+
+    dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE};
+
+    dim3 gridDim{
+      min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+      1,
+      min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)};
+
+    RAFT_CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream));
+    computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, 1, obs, centroids, work);
+    RAFT_CHECK_CUDA(stream);
+    *residual_host = thrust::reduce(
+      thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n));
+    RAFT_CHECK_CUDA(stream);
+    return 0;
+  }
+  if (n <= k) {
+    thrust::sequence(thrust_exec_policy,
+                     thrust::device_pointer_cast(codes),
+                     thrust::device_pointer_cast(codes + n));
+    RAFT_CHECK_CUDA(stream);
+    thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1);
+    RAFT_CHECK_CUDA(stream);
+
+    if (n < k)
RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync( + centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + *residual_host = 0; + return 0; + } + + // Initialize cuBLAS + RAFT_CUBLAS_TRY(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // k-means++ algorithm + // ------------------------------------------------------- + + // Choose initial cluster centroids + if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, work, seed)) + WARNING("could not initialize k-means centroids"); + + // Apply k-means iteration until convergence + for (iter = 0; iter < maxiter; ++iter) { + // Update cluster centroids + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + WARNING("could not update k-means centroids"); + + // Determine centroid closest to each observation + residualPrev = *residual_host; + if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + + // Reinitialize empty clusters with new centroids + index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); + + // FIXME: emptyCentroid never reaches k (infinite loop) under certain + // conditions, such as if obs is corrupt (as seen as a result of a + // DataFrame column of NULL edge vals used to create the Graph) + while (emptyCentroid < k) { + if (chooseNewCentroid( + handle, n, d, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) + WARNING("could not replace empty centroid"); + if (assignCentroids( + handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); + RAFT_CHECK_CUDA(stream); + } + + // Check for convergence + if (std::fabs(residualPrev - (*residual_host)) / n < tol) { + ++iter; + break; + } + } + + // Warning if k-means has failed to converge + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); + + *iters_host = iter; + return 0; + } + +/** + * @brief Find clusters with k-means algorithm. + * Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param tol Tolerance for convergence. k-means stops when the + * change in residual divided by n is less than tol. + * @param maxiter Maximum number of k-means iterations. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Output, device memory, n entries) Cluster + * assignments. 
+ * @param residual On exit, residual sum of squares (sum of squares
+ *   of distances between observation vectors and centroids).
+ * @param iters On exit, number of k-means iterations.
+ * @param seed random seed to be used.
+ * @return error flag
+ */
+template <typename index_type_t, typename value_type_t>
+int kmeans(handle_t const& handle,
+           index_type_t n,
+           index_type_t d,
+           index_type_t k,
+           value_type_t tol,
+           index_type_t maxiter,
+           const value_type_t* __restrict__ obs,
+           index_type_t* __restrict__ codes,
+           value_type_t& residual,
+           index_type_t& iters,
+           unsigned long long seed = 123456)
+{
+  using namespace matrix;
+
+  // Check that parameters are valid
+  RAFT_EXPECTS(n > 0, "invalid parameter (n<1)");
+  RAFT_EXPECTS(d > 0, "invalid parameter (d<1)");
+  RAFT_EXPECTS(k > 0, "invalid parameter (k<1)");
+  RAFT_EXPECTS(tol > 0, "invalid parameter (tol<=0)");
+  RAFT_EXPECTS(maxiter >= 0, "invalid parameter (maxiter<0)");
+
+  // Allocate memory
+  vector_t<index_type_t> clusterSizes(handle, k);
+  vector_t<value_type_t> centroids(handle, d * k);
+  vector_t<value_type_t> work(handle, n * max(k, d));
+  vector_t<index_type_t> work_int(handle, 2 * d * n);
+
+  // Perform k-means
+  return kmeans(handle,
+                n,
+                d,
+                k,
+                tol,
+                maxiter,
+                obs,
+                codes,
+                clusterSizes.raw(),
+                centroids.raw(),
+                work.raw(),
+                work_int.raw(),
+                &residual,
+                &iters,
+                seed);
+}
+
+}  // namespace detail
+}  // namespace cluster
+}  // namespace raft
diff --git a/cpp/include/raft/cluster/kmeans.hpp b/cpp/include/raft/cluster/kmeans.hpp
new file mode 100644
index 0000000000..fe4dd4793a
--- /dev/null
+++ b/cpp/include/raft/cluster/kmeans.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/cluster/detail/kmeans.hpp>
+
+namespace raft {
+namespace cluster {
+
+/**
+ * @brief Find clusters with k-means algorithm.
+ *   Initial centroids are chosen with k-means++ algorithm. Empty
+ *   clusters are reinitialized by choosing new centroids with
+ *   k-means++ algorithm.
+ * @tparam index_type_t the type of data used for indexing.
+ * @tparam value_type_t the type of data used for weights, distances.
+ * @param handle the raft handle.
+ * @param n Number of observation vectors.
+ * @param d Dimension of observation vectors.
+ * @param k Number of clusters.
+ * @param tol Tolerance for convergence. k-means stops when the
+ *   change in residual divided by n is less than tol.
+ * @param maxiter Maximum number of k-means iterations.
+ * @param obs (Input, device memory, d*n entries) Observation
+ *   matrix. Matrix is stored column-major and each column is an
+ *   observation vector. Matrix dimensions are d x n.
+ * @param codes (Output, device memory, n entries) Cluster
+ *   assignments.
+ * @param residual On exit, residual sum of squares (sum of squares
+ *   of distances between observation vectors and centroids).
+ * @param iters On exit, number of k-means iterations.
+ * @param seed random seed to be used.
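+ *
+ * Usage sketch (illustrative only; assumes 'obs' and 'codes' are device
+ * buffers sized as documented above):
+ * @code{.cpp}
+ * raft::handle_t handle;
+ * int n = 10000, d = 16, k = 8;
+ * float residual = 0;
+ * int   iters    = 0;
+ * raft::cluster::kmeans(handle, n, d, k, 1e-4f, 100, obs, codes, residual, iters);
+ * @endcode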
+ * @return error flag
+ */
+template <typename index_type_t, typename value_type_t>
+int kmeans(handle_t const& handle,
+           index_type_t n,
+           index_type_t d,
+           index_type_t k,
+           value_type_t tol,
+           index_type_t maxiter,
+           const value_type_t* __restrict__ obs,
+           index_type_t* __restrict__ codes,
+           value_type_t& residual,
+           index_type_t& iters,
+           unsigned long long seed = 123456)
+{
+  return detail::kmeans(handle, n, d, k, tol, maxiter, obs, codes, residual, iters, seed);
+}
+}  // namespace cluster
+}  // namespace raft
diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp
new file mode 100644
index 0000000000..a4d0cae359
--- /dev/null
+++ b/cpp/include/raft/label/classlabels.hpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/label/detail/classlabels.cuh>
+
+namespace raft {
+namespace label {
+
+/**
+ * Get unique class labels.
+ *
+ * The y array is assumed to store class labels. The unique values are selected
+ * from this array.
+ *
+ * \tparam value_t numeric type of the arrays with class labels
+ * \param [out] unique device array of unique labels, unallocated on entry,
+ *   on exit it has size [n_unique]
+ * \param [in] y device array of labels, size [n]
+ * \param [in] n number of labels
+ * \param [in] stream cuda stream
+ * \return the number of unique labels
+ */
+template <typename value_t>
+int getUniquelabels(rmm::device_uvector<value_t>& unique, value_t* y, size_t n, cudaStream_t stream)
+{
+  return detail::getUniqueLabels(unique, y, n, stream);
+}
+
+/**
+ * Assign one versus rest labels.
+ *
+ * The output labels will have values +/-1:
+ *   y_out = (y == y_unique[idx]) ? +1 : -1;
+ *
+ * The output type is currently set to value_t, but in principle we are free
+ * to choose another type for y_out (it should represent +/-1, and it is used
+ * in floating point arithmetics).
+ *
+ * \param [in] y device array of input labels, size [n]
+ * \param [in] n number of labels
+ * \param [in] y_unique device array of unique labels, size [n_classes]
+ * \param [in] n_classes number of unique labels
+ * \param [out] y_out device array of output labels
+ * \param [in] idx index of unique label that should be labeled as 1
+ * \param [in] stream cuda stream
+ */
+template <typename value_t>
+void getOvrlabels(
+  value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream)
+{
+  detail::getOvrLabels(y, n, y_unique, n_classes, y_out, idx, stream);
+}
+
+/**
+ * Maps an input array containing a series of numbers into a new array
+ * where numbers have been mapped to a monotonically increasing set
+ * of labels. This can be useful in machine learning algorithms, for instance,
+ * where a given set of labels is not taken from a monotonically increasing
+ * set. This can happen if they are filtered or if only a subset of the
+ * total labels are used in a dataset. This is also useful in graph algorithms
+ * where a set of vertices need to be labeled in a monotonically increasing
+ * order.
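+ *
+ * For example, assuming labels are remapped in sorted order, the input
+ * {2, 7, 7, 3} becomes {1, 3, 3, 2} (or {0, 2, 2, 1} with zero_based).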
+ * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param out the output monotonic array + * @param in input label array + * @param N number of elements in the input array + * @param stream cuda stream to use + * @param filter_op an optional function for specifying which values + * should have monotonically increasing labels applied to them. + */ +template +void make_monotonic( + Type* out, Type* in, size_t N, cudaStream_t stream, Lambda filter_op, bool zero_based = false) +{ + detail::make_monotonic(out, in, N, stream, filter_op, zero_based); +} + +/** + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices need to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param out output label array with labels assigned monotonically + * @param in input label array + * @param N number of elements in the input array + * @param stream cuda stream to use + */ +template +void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zero_based = false) +{ + detail::make_monotonic(out, in, N, stream, zero_based); +} +}; // namespace label +}; // end namespace raft diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/detail/classlabels.cuh similarity index 99% rename from cpp/include/raft/label/classlabels.cuh rename to cpp/include/raft/label/detail/classlabels.cuh index dce732b06b..b7582be1a3 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/detail/classlabels.cuh @@ -26,6 +26,7 @@ namespace raft { namespace label { +namespace detail { /** * Get unique class labels. @@ -194,5 +195,7 @@ void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zer make_monotonic( out, in, N, stream, [] __device__(Type val) { return false; }, zero_based); } + +}; // namespace detail }; // namespace label }; // end namespace raft diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/detail/merge_labels.cuh similarity index 100% rename from cpp/include/raft/label/merge_labels.cuh rename to cpp/include/raft/label/detail/merge_labels.cuh diff --git a/cpp/include/raft/label/merge_labels.hpp b/cpp/include/raft/label/merge_labels.hpp new file mode 100644 index 0000000000..b21502a06a --- /dev/null +++ b/cpp/include/raft/label/merge_labels.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/label/detail/merge_labels.cuh>
+
+namespace raft {
+namespace label {
+
+/**
+ * @brief Merge two labellings in-place, according to a core mask
+ *
+ * A labelling is a representation of disjoint sets (groups) where points that
+ * belong to the same group have the same label. It is assumed that group
+ * labels take values between 1 and N. Labels relate to points, i.e. a label
+ * i+1 means that you belong to the same group as the point i.
+ * The special value MAX_LABEL is used to mark points that are not labelled.
+ *
+ * The two label arrays A and B induce two sets of groups over points 0..N-1.
+ * If a point is labelled i in A and j in B and the mask is true for this
+ * point, then i and j are equivalent labels and their groups are merged by
+ * relabeling the elements of both groups to have the same label. The new label
+ * is the smaller one from the original labels.
+ * It is required that if the mask is true for a point, this point is labelled
+ * (i.e. its label is different than the special value MAX_LABEL).
+ *
+ * One use case is finding connected components: the two input label arrays can
+ * represent the connected components of graphs G_A and G_B, and the output
+ * would be the connected components labels of the union of G_A and G_B.
+ *
+ * @param[inout] labels_a First input, and output label array (in-place)
+ * @param[in] labels_b Second input label array
+ * @param[in] mask Core point mask
+ * @param[out] R label equivalence map
+ * @param[in] m Working flag
+ * @param[in] N Number of points in the dataset
+ * @param[in] stream CUDA stream
+ */
+template <typename value_idx>
+void merge_labels(value_idx* labels_a,
+                  const value_idx* labels_b,
+                  const bool* mask,
+                  value_idx* R,
+                  bool* m,
+                  value_idx N,
+                  cudaStream_t stream)
+{
+  detail::merge_labels(labels_a, labels_b, mask, R, m, N, stream);
+}
+
+};  // namespace label
+};  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/lap/d_structs.h b/cpp/include/raft/lap/detail/d_structs.h
similarity index 100%
rename from cpp/include/raft/lap/d_structs.h
rename to cpp/include/raft/lap/detail/d_structs.h
diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/detail/lap_functions.cuh
similarity index 100%
rename from cpp/include/raft/lap/lap_functions.cuh
rename to cpp/include/raft/lap/detail/lap_functions.cuh
diff --git a/cpp/include/raft/lap/lap_kernels.cuh b/cpp/include/raft/lap/detail/lap_kernels.cuh
similarity index 100%
rename from cpp/include/raft/lap/lap_kernels.cuh
rename to cpp/include/raft/lap/detail/lap_kernels.cuh
diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.hpp
similarity index 99%
rename from cpp/include/raft/lap/lap.cuh
rename to cpp/include/raft/lap/lap.hpp
index 42b898ebff..2350ebcddf 100644
--- a/cpp/include/raft/lap/lap.cuh
+++ b/cpp/include/raft/lap/lap.hpp
@@ -27,8 +27,8 @@
 #include
 #include
 
-#include "d_structs.h"
-#include "lap_functions.cuh"
+#include "detail/d_structs.h"
+#include "detail/lap_functions.cuh"
 
 namespace raft {
 namespace lap {
diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
index de62f25ffa..2519c1914e 100644
--- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
@@ -17,7 +17,6 @@
 #include
 #include
-#include
 #include
 #include
diff --git
a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 221a9679d4..092e51a17d 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -15,10 +15,11 @@ */ #pragma once -#include +#include #include // for std::pair namespace raft { +namespace spectral { using namespace matrix; @@ -52,7 +53,8 @@ struct kmeans_solver_t { RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; - kmeans(handle, + + raft::cluster::kmeans(handle, n_obs_vecs, dim, config_.n_clusters, @@ -71,4 +73,6 @@ struct kmeans_solver_t { private: cluster_solver_config_t config_; }; + +} // namespace spectral } // namespace raft diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/detail/lapack.hpp similarity index 100% rename from cpp/include/raft/spectral/lapack.hpp rename to cpp/include/raft/spectral/detail/lapack.hpp diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp similarity index 100% rename from cpp/include/raft/spectral/matrix_wrappers.hpp rename to cpp/include/raft/spectral/detail/matrix_wrappers.hpp diff --git a/cpp/include/raft/spectral/detail/modularity_maximization.hpp b/cpp/include/raft/spectral/detail/modularity_maximization.hpp new file mode 100644 index 0000000000..ec0e23dfe0 --- /dev/null +++ b/cpp/include/raft/spectral/detail/modularity_maximization.hpp @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#ifdef COLLECT_TIME_STATISTICS +#include +#include +#include +#include +#include +#endif + +#ifdef COLLECT_TIME_STATISTICS +static double timer(void) +{ + struct timeval tv; + cudaDeviceSynchronize(); + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} +#endif + +namespace raft { +namespace spectral { +namespace detail { + +using namespace matrix; +using namespace linalg; + +// ========================================================= +// Spectral modularity_maximization +// ========================================================= + +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. 
+ * @param clusters (Output, device memory, n entries) Cluster + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return error flag. + */ +template +std::tuple modularity_maximization( + handle_t const& handle, + sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + + std::tuple + stats; // # iters eigen solver, cluster solver residual, # iters cluster solver + + vertex_t n = csr_m.nrows_; + + // Compute eigenvectors of Modularity Matrix + + // Initialize Modularity Matrix + modularity_matrix_t B{handle, csr_m}; + + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute eigenvectors corresponding to largest eigenvalues + std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); + + // notice that at this point the matrix has already been transposed, so we are scaling + // columns + scale_obs(nEigVecs, n, eigVecs); + RAFT_CHECK_CUDA(stream); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; +} +//=================================================== +// Analysis of graph partition +// ========================================================= + +/// Compute modularity +/** This function determines the modularity based on a graph and cluster assignments + * @param G Weighted graph in CSR format + * @param nClusters Number of clusters. + * @param clusters (Input, device memory, n entries) Cluster assignments. 
+ * @param modularity On exit, modularity + */ +template +void analyzeModularity(handle_t const& handle, + sparse_matrix_t const& csr_m, + vertex_t nClusters, + vertex_t const* __restrict__ clusters, + weight_t& modularity) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + weight_t partModularity, clustersize; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Device memory + vector_t part_i(handle, n); + vector_t Bx(handle, n); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Modularity + modularity_matrix_t B{handle, csr_m}; + + // Initialize output + modularity = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + if (!construct_indicator(handle, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { + WARNING("empty partition"); + continue; + } + + // Record results + modularity += partModularity; + } + + modularity = modularity / B.diagonal_.nrm1(); +} + +} // namespace detail +} // namespace spectral +} // namespace raft diff --git a/cpp/include/raft/spectral/detail/partition.hpp b/cpp/include/raft/spectral/detail/partition.hpp new file mode 100644 index 0000000000..82313f84ee --- /dev/null +++ b/cpp/include/raft/spectral/detail/partition.hpp @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace raft { +namespace spectral { +namespace detail { + +using namespace matrix; +using namespace linalg; + +// ========================================================= +// Spectral partitioner +// ========================================================= + +/// Compute spectral graph partition +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param clusters (Output, device memory, n entries) Partition + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return statistics: number of eigensolver iterations, . 
+ */
+template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
+std::tuple<vertex_t, weight_t, vertex_t> partition(handle_t const& handle,
+                                                   sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                                                   EigenSolver const& eigen_solver,
+                                                   ClusterSolver const& cluster_solver,
+                                                   vertex_t* __restrict__ clusters,
+                                                   weight_t* eigVals,
+                                                   weight_t* eigVecs) {
+  RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
+  RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
+  RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
+
+  auto stream   = handle.get_stream();
+  auto cublas_h = handle.get_cublas_handle();
+
+  // # iters eigen solver, cluster solver residual, # iters cluster solver
+  std::tuple<vertex_t, weight_t, vertex_t> stats;
+
+  vertex_t n = csr_m.nrows_;
+
+  // -------------------------------------------------------
+  // Spectral partitioner
+  // -------------------------------------------------------
+
+  // Compute eigenvectors of Laplacian
+
+  // Initialize Laplacian
+  laplacian_matrix_t<vertex_t, weight_t> L{handle, csr_m};
+
+  auto eigen_config = eigen_solver.get_config();
+  auto nEigVecs     = eigen_config.n_eigVecs;
+
+  // Compute smallest eigenvalues and eigenvectors
+  std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
+
+  // Whiten eigenvector matrix
+  transform_eigen_matrix(handle, n, nEigVecs, eigVecs);
+
+  // Find partition clustering
+  auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters);
+
+  std::get<1>(stats) = pair_cluster.first;
+  std::get<2>(stats) = pair_cluster.second;
+
+  return stats;
+}
+
+// =========================================================
+// Analysis of graph partition
+// =========================================================
+
+/// Compute cost function for partition
+/** This function determines the edges cut by a partition and a cost
+ * function:
+ *   Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition)
+ * Graph is assumed to be weighted and undirected.
+ *
+ * @param handle raft handle.
+ * @param csr_m Weighted graph in CSR format.
+ * @param nClusters Number of partitions.
+ * @param clusters (Input, device memory, n entries) Partition
+ * assignments.
+ * @param edgeCut On exit, weight of edges cut by partition.
+ * @param cost On exit, partition cost function.
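+ *
+ * A minimal sketch of the expected call (assuming clusters was produced
+ * by partition() above):
+ * @code{.cpp}
+ *   weight_t edgeCut{0};
+ *   weight_t cost{0};
+ *   analyzePartition(handle, csr_m, nClusters, clusters, edgeCut, cost);
+ *   // edgeCut: total weight of edges cut by the partition
+ *   // cost: sum over clusters of (cut weight) / (cluster size)
+ * @endcode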
+ */ +template +void analyzePartition(handle_t const& handle, + sparse_matrix_t const& csr_m, + vertex_t nClusters, + const vertex_t* __restrict__ clusters, + weight_t& edgeCut, + weight_t& cost) { + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + + weight_t partEdgesCut, clustersize; + + // Device memory + vector_t part_i(handle, n); + vector_t Lx(handle, n); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Laplacian + /// sparse_matrix_t A{handle, graph}; + laplacian_matrix_t L{handle, csr_m}; + + // Initialize output + cost = 0; + edgeCut = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + // Construct indicator vector for ith partition + if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { + WARNING("empty partition"); + continue; + } + + // Record results + cost += partEdgesCut / clustersize; + edgeCut += partEdgesCut / 2; + } +} + +} // namespace detail +} // namespace spectral +} // namespace raft diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/detail/spectral_util.hpp similarity index 100% rename from cpp/include/raft/spectral/spectral_util.hpp rename to cpp/include/raft/spectral/detail/spectral_util.hpp diff --git a/cpp/include/raft/spectral/warn_dbg.hpp b/cpp/include/raft/spectral/detail/warn_dbg.hpp similarity index 100% rename from cpp/include/raft/spectral/warn_dbg.hpp rename to cpp/include/raft/spectral/detail/warn_dbg.hpp diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index 156b996586..192dc15a6b 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -18,6 +18,7 @@ #include namespace raft { +namespace spectral { using namespace matrix; @@ -95,4 +96,6 @@ struct lanczos_solver_t { private: eigen_solver_config_t config_; }; + +} // namespace spectral } // namespace raft diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp deleted file mode 100644 index bcb28bfb1e..0000000000 --- a/cpp/include/raft/spectral/kmeans.hpp +++ /dev/null @@ -1,986 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace { - -using namespace raft; -using namespace raft::linalg; -// ========================================================= -// Useful grid settings -// ========================================================= - -constexpr unsigned int BLOCK_SIZE = 1024; -constexpr unsigned int WARP_SIZE = 32; -constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); - -// ========================================================= -// CUDA kernels -// ========================================================= - -/** - * @brief Compute distances between observation vectors and centroids - * Block dimensions should be (warpSize, 1, - * blockSize/warpSize). Ideally, the grid is large enough so there - * are d threads in the x-direction, k threads in the y-direction, - * and n threads in the z-direction. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param obs (Input, d*n entries) Observation matrix. Matrix is - * stored column-major and each column is an observation - * vector. Matrix dimensions are d x n. - * @param centroids (Input, d*k entries) Centroid matrix. Matrix is - * stored column-major and each column is a centroid. Matrix - * dimensions are d x k. - * @param dists (Output, n*k entries) Distance matrix. Matrix is - * stored column-major and the (i,j)-entry is the square of the - * Euclidean distance between the ith observation vector and jth - * centroid. Matrix dimensions are n x k. Entries must be - * initialized to zero. - */ -template -static __global__ void computeDistances(index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists) -{ - // Loop index - index_type_t i; - - // Block indices - index_type_t bidx; - // Global indices - index_type_t gidx, gidy, gidz; - - // Private memory - value_type_t centroid_private, dist_private; - - // Global x-index indicates index of vector entry - bidx = blockIdx.x; - while (bidx * blockDim.x < d) { - gidx = threadIdx.x + bidx * blockDim.x; - - // Global y-index indicates centroid - gidy = threadIdx.y + blockIdx.y * blockDim.y; - while (gidy < k) { - // Load centroid coordinate from global memory - centroid_private = (gidx < d) ? centroids[IDX(gidx, gidy, d)] : 0; - - // Global z-index indicates observation vector - gidz = threadIdx.z + blockIdx.z * blockDim.z; - while (gidz < n) { - // Load observation vector coordinate from global memory - dist_private = (gidx < d) ? 
obs[IDX(gidx, gidz, d)] : 0; - - // Compute contribution of current entry to distance - dist_private = centroid_private - dist_private; - dist_private = dist_private * dist_private; - - // Perform reduction on warp - for (i = WARP_SIZE / 2; i > 0; i /= 2) - dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); - - // Write result to global memory - if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private); - - // Move to another observation vector - gidz += blockDim.z * gridDim.z; - } - - // Move to another centroid - gidy += blockDim.y * gridDim.y; - } - - // Move to another vector entry - bidx += gridDim.x; - } -} - -/** - * @brief Find closest centroid to observation vectors. - * Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param n Number of observation vectors. - * @param k Number of clusters. - * @param centroids (Input, d*k entries) Centroid matrix. Matrix is - * stored column-major and each column is a centroid. Matrix - * dimensions are d x k. - * @param dists (Input/output, n*k entries) Distance matrix. Matrix - * is stored column-major and the (i,j)-entry is the square of - * the Euclidean distance between the ith observation vector and - * jth centroid. Matrix dimensions are n x k. On exit, the first - * n entries give the square of the Euclidean distance between - * observation vectors and closest centroids. - * @param codes (Output, n entries) Cluster assignments. - * @param clusterSizes (Output, k entries) Number of points in each - * cluster. Entries must be initialized to zero. - */ -template -static __global__ void minDistances(index_type_t n, - index_type_t k, - value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) -{ - // Loop index - index_type_t i, j; - - // Current matrix entry - value_type_t dist_curr; - - // Smallest entry in row - value_type_t dist_min; - index_type_t code_min; - - // Each row in observation matrix is processed by a thread - i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - // Find minimum entry in row - code_min = 0; - dist_min = dists[IDX(i, 0, n)]; - for (j = 1; j < k; ++j) { - dist_curr = dists[IDX(i, j, n)]; - code_min = (dist_curr < dist_min) ? j : code_min; - dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; - } - - // Transfer result to global memory - dists[i] = dist_min; - codes[i] = code_min; - - // Increment cluster sizes - atomicAdd(clusterSizes + code_min, 1); - - // Move to another row - i += blockDim.x * gridDim.x; - } -} - -/** - * @brief Check if newly computed distances are smaller than old distances. - * Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param n Number of observation vectors. - * @param dists_old (Input/output, n entries) Distances between - * observation vectors and closest centroids. On exit, entries - * are replaced by entries in 'dists_new' if the corresponding - * observation vectors are closest to the new centroid. - * @param dists_new (Input, n entries) Distance between observation - * vectors and new centroid. - * @param codes_old (Input/output, n entries) Cluster - * assignments. 
On exit, entries are replaced with 'code_new' if - * the corresponding observation vectors are closest to the new - * centroid. - * @param code_new Index associated with new centroid. - */ -template -static __global__ void minDistances2(index_type_t n, - value_type_t* __restrict__ dists_old, - const value_type_t* __restrict__ dists_new, - index_type_t* __restrict__ codes_old, - index_type_t code_new) -{ - // Loop index - index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; - - // Distances - value_type_t dist_old_private; - value_type_t dist_new_private; - - // Each row is processed by a thread - while (i < n) { - // Get old and new distances - dist_old_private = dists_old[i]; - dist_new_private = dists_new[i]; - - // Update if new distance is smaller than old distance - if (dist_new_private < dist_old_private) { - dists_old[i] = dist_new_private; - codes_old[i] = code_new; - } - - // Move to another row - i += blockDim.x * gridDim.x; - } -} - -/** - * @brief Compute size of k-means clusters. - * Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * @tparam index_type_t the type of data used for indexing. - * @param n Number of observation vectors. - * @param k Number of clusters. - * @param codes (Input, n entries) Cluster assignments. - * @param clusterSizes (Output, k entries) Number of points in each - * cluster. Entries must be initialized to zero. - */ -template -static __global__ void computeClusterSizes(index_type_t n, - const index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) -{ - index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - atomicAdd(clusterSizes + codes[i], 1); - i += blockDim.x * gridDim.x; - } -} - -/** - * @brief Divide rows of centroid matrix by cluster sizes. - * Divides the ith column of the sum matrix by the size of the ith - * cluster. If the sum matrix has been initialized so that the ith - * row is the sum of all observation vectors in the ith cluster, - * this kernel produces cluster centroids. The grid and block - * dimensions should be 2-dimensional. Ideally the grid is large - * enough so there are d threads in the x-direction and k threads - * in the y-direction. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param clusterSizes (Input, k entries) Number of points in each - * cluster. - * @param centroids (Input/output, d*k entries) Sum matrix. Matrix - * is stored column-major and matrix dimensions are d x k. The - * ith column is the sum of all observation vectors in the ith - * cluster. On exit, the matrix is the centroid matrix (each - * column is the mean position of a cluster). 
- */ -template -static __global__ void divideCentroids(index_type_t d, - index_type_t k, - const index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids) -{ - // Global indices - index_type_t gidx, gidy; - - // Current cluster size - index_type_t clusterSize_private; - - // Observation vector is determined by global y-index - gidy = threadIdx.y + blockIdx.y * blockDim.y; - while (gidy < k) { - // Get cluster size from global memory - clusterSize_private = clusterSizes[gidy]; - - // Add vector entries to centroid matrix - // vector entris are determined by global x-index - gidx = threadIdx.x + blockIdx.x * blockDim.x; - while (gidx < d) { - centroids[IDX(gidx, gidy, d)] /= clusterSize_private; - gidx += blockDim.x * gridDim.x; - } - - // Move to another centroid - gidy += blockDim.y * gridDim.y; - } -} - -// ========================================================= -// Helper functions -// ========================================================= - -/** - * @brief Randomly choose new centroids. - * Centroid is randomly chosen with k-means++ algorithm. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param rand Random number drawn uniformly from [0,1). - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are n x d. - * @param dists (Input, device memory, 2*n entries) Workspace. The - * first n entries should be the distance between observation - * vectors and the closest centroid. - * @param centroid (Output, device memory, d entries) Centroid - * coordinates. - * @return Zero if successful. Otherwise non-zero. 
- */ -template -static int chooseNewCentroid(handle_t const& handle, - index_type_t n, - index_type_t d, - value_type_t rand, - const value_type_t* __restrict__ obs, - value_type_t* __restrict__ dists, - value_type_t* __restrict__ centroid) -{ - // Cumulative sum of distances - value_type_t* distsCumSum = dists + n; - // Residual sum of squares - value_type_t distsSum{0}; - // Observation vector that is chosen as new centroid - index_type_t obsIndex; - - auto stream = handle.get_stream(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Compute cumulative sum of distances - thrust::inclusive_scan(thrust_exec_policy, - thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n), - thrust::device_pointer_cast(distsCumSum)); - CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpyAsync( - &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); - - // Randomly choose observation vector - // Probabilities are proportional to square of distance to closest - // centroid (see k-means++ algorithm) - // - // seg-faults due to Thrust bug - // on binary-search-like algorithms - // when run with stream dependent - // execution policies; fixed on Thrust GitHub - // hence replace w/ linear interpolation, - // until the Thrust issue gets resolved: - // - // obsIndex = (thrust::lower_bound( - // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), - // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - - // thrust::device_pointer_cast(distsCumSum)); - // - // linear interpolation logic: - //{ - value_type_t minSum{0}; - RAFT_CUDA_TRY( - cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); - RAFT_CHECK_CUDA(stream); - - if (distsSum > minSum) { - value_type_t vIndex = static_cast(n - 1); - obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / (distsSum - minSum)); - } else { - obsIndex = 0; - } - //} - - RAFT_CHECK_CUDA(stream); - obsIndex = max(obsIndex, 0); - obsIndex = min(obsIndex, n - 1); - - // Record new centroid position - RAFT_CUDA_TRY(cudaMemcpyAsync(centroid, - obs + IDX(0, obsIndex, d), - d * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); - - return 0; -} - -/** - * @brief Choose initial cluster centroids for k-means algorithm. - * Centroids are randomly chosen with k-means++ algorithm - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param centroids (Output, device memory, d*k entries) Centroid - * matrix. Matrix is stored column-major and each column is a - * centroid. Matrix dimensions are d x k. - * @param codes (Output, device memory, n entries) Cluster - * assignments. - * @param clusterSizes (Output, device memory, k entries) Number of - * points in each cluster. - * @param dists (Output, device memory, 2*n entries) Workspace. On - * exit, the first n entries give the square of the Euclidean - * distance between observation vectors and the closest centroid. - * @return Zero if successful. Otherwise non-zero. 
- */ -template -static int initializeCentroids(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - value_type_t* __restrict__ centroids, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ dists, - unsigned long long seed) -{ - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Loop index - index_type_t i; - - // Random number generator - thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution uniformDist(0, 1); - - auto stream = handle.get_stream(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - constexpr index_type_t grid_lower_bound{65535}; - - // ------------------------------------------------------- - // Implementation - // ------------------------------------------------------- - - // Initialize grid dimensions - dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; - - // CUDA grid dimensions - dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - 1, - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; - - // CUDA grid dimensions - dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1}; - - // Assign observation vectors to code 0 - RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - - // Choose first centroid - thrust::fill(thrust_exec_policy, - thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n), - 1); - RAFT_CHECK_CUDA(stream); - if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids)) - WARNING("error in k-means++ (could not pick centroid)"); - - // Compute distances from first centroid - RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, dists); - RAFT_CHECK_CUDA(stream); - - // Choose remaining centroids - for (i = 1; i < k; ++i) { - // Choose ith centroid - if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) - WARNING("error in k-means++ (could not pick centroid)"); - - // Compute distances from ith centroid - CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); - computeDistances<<>>( - n, d, 1, obs, centroids + IDX(0, i, d), dists + n); - RAFT_CHECK_CUDA(stream); - - // Recompute minimum distances - minDistances2<<>>(n, dists, dists + n, codes, i); - RAFT_CHECK_CUDA(stream); - } - - // Compute cluster sizes - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - computeClusterSizes<<>>(n, codes, clusterSizes); - RAFT_CHECK_CUDA(stream); - - return 0; -} - -/** - * @brief Find cluster centroids closest to observation vectors. - * Distance is measured with Euclidean norm. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param centroids (Input, device memory, d*k entries) Centroid - * matrix. Matrix is stored column-major and each column is a - * centroid. Matrix dimensions are d x k. 
- * @param dists (Output, device memory, n*k entries) Workspace. On - * exit, the first n entries give the square of the Euclidean - * distance between observation vectors and the closest centroid. - * @param codes (Output, device memory, n entries) Cluster - * assignments. - * @param clusterSizes (Output, device memory, k entries) Number of - * points in each cluster. - * @param residual_host (Output, host memory, 1 entry) Residual sum - * of squares of assignment. - * @return Zero if successful. Otherwise non-zero. - */ -template -static int assignCentroids(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, - value_type_t* residual_host) -{ - auto stream = handle.get_stream(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Compute distance between centroids and observation vectors - RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); - - // CUDA grid dimensions - dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; - - dim3 gridDim; - constexpr index_type_t grid_lower_bound{65535}; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); - gridDim.y = min(k, grid_lower_bound); - gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); - - computeDistances<<>>(n, d, k, obs, centroids, dists); - RAFT_CHECK_CUDA(stream); - - // Find centroid closest to each observation vector - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - blockDim.x = BLOCK_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); - gridDim.y = 1; - gridDim.z = 1; - minDistances<<>>(n, k, dists, codes, clusterSizes); - CHECK_CUDA(stream); - - // Compute residual sum of squares - *residual_host = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); - - return 0; -} - -/** - * @brief Update cluster centroids for k-means algorithm. - * All clusters are assumed to be non-empty. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param codes (Input, device memory, n entries) Cluster - * assignments. - * @param clusterSizes (Input, device memory, k entries) Number of - * points in each cluster. - * @param centroids (Output, device memory, d*k entries) Centroid - * matrix. Matrix is stored column-major and each column is a - * centroid. Matrix dimensions are d x k. - * @param work (Output, device memory, n*d entries) Workspace. - * @param work_int (Output, device memory, 2*d*n entries) - * Workspace. - * @return Zero if successful. Otherwise non-zero. 
- */ -template -static int updateCentroids(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const index_type_t* __restrict__ codes, - const index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids, - value_type_t* __restrict__ work, - index_type_t* __restrict__ work_int) -{ - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Useful constants - const value_type_t one = 1; - const value_type_t zero = 0; - - constexpr index_type_t grid_lower_bound{65535}; - - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Device memory - thrust::device_ptr obs_copy(work); - thrust::device_ptr codes_copy(work_int); - thrust::device_ptr rows(work_int + d * n); - - // Take transpose of observation matrix - RAFT_CUBLAS_TRY(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - d, - &one, - obs, - d, - &zero, - (value_type_t*)NULL, - n, - thrust::raw_pointer_cast(obs_copy), - n, - stream)); - - // Cluster assigned to each observation matrix entry - thrust::sequence(thrust_exec_policy, rows, rows + d * n); - RAFT_CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, - rows, - rows + d * n, - thrust::make_constant_iterator(n), - rows, - thrust::modulus()); - RAFT_CHECK_CUDA(stream); - thrust::gather( - thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); - RAFT_CHECK_CUDA(stream); - - // Row associated with each observation matrix entry - thrust::sequence(thrust_exec_policy, rows, rows + d * n); - RAFT_CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, - rows, - rows + d * n, - thrust::make_constant_iterator(n), - rows, - thrust::divides()); - RAFT_CHECK_CUDA(stream); - - // Sort and reduce to add observation vectors in same cluster - thrust::stable_sort_by_key(thrust_exec_policy, - codes_copy, - codes_copy + d * n, - make_zip_iterator(make_tuple(obs_copy, rows))); - RAFT_CHECK_CUDA(stream); - thrust::reduce_by_key(thrust_exec_policy, - rows, - rows + d * n, - obs_copy, - codes_copy, // Output to codes_copy is ignored - thrust::device_pointer_cast(centroids)); - RAFT_CHECK_CUDA(stream); - - // Divide sums by cluster size to get centroid matrix - // - // CUDA grid dimensions - dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; - - // CUDA grid dimensions - dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), - 1}; - - divideCentroids<<>>(d, k, clusterSizes, centroids); - RAFT_CHECK_CUDA(stream); - - return 0; -} - -} // namespace - -namespace raft { - -// ========================================================= -// k-means algorithm -// ========================================================= - -/** - * @brief Find clusters with k-means algorithm. - * Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param tol Tolerance for convergence. 
k-means stops when the - * change in residual divided by n is less than tol. - * @param maxiter Maximum number of k-means iterations. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param codes (Output, device memory, n entries) Cluster - * assignments. - * @param clusterSizes (Output, device memory, k entries) Number of - * points in each cluster. - * @param centroids (Output, device memory, d*k entries) Centroid - * matrix. Matrix is stored column-major and each column is a - * centroid. Matrix dimensions are d x k. - * @param work (Output, device memory, n*max(k,d) entries) - * Workspace. - * @param work_int (Output, device memory, 2*d*n entries) - * Workspace. - * @param residual_host (Output, host memory, 1 entry) Residual sum - * of squares (sum of squares of distances between observation - * vectors and centroids). - * @param iters_host (Output, host memory, 1 entry) Number of - * k-means iterations. - * @param seed random seed to be used. - * @return error flag. - */ -template -int kmeans(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - value_type_t tol, - index_type_t maxiter, - const value_type_t* __restrict__ obs, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids, - value_type_t* __restrict__ work, - index_type_t* __restrict__ work_int, - value_type_t* residual_host, - index_type_t* iters_host, - unsigned long long seed) -{ - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Current iteration - index_type_t iter; - - constexpr index_type_t grid_lower_bound{65535}; - - // Residual sum of squares at previous iteration - value_type_t residualPrev = 0; - - // Random number generator - thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution uniformDist(0, 1); - - // ------------------------------------------------------- - // Initialization - // ------------------------------------------------------- - - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Trivial cases - if (k == 1) { - CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - CUDA_TRY( - cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) - WARNING("could not compute k-means centroids"); - - dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; - - dim3 gridDim{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - 1, - min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)}; - - CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, work); - RAFT_CHECK_CUDA(stream); - *residual_host = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); - RAFT_CHECK_CUDA(stream); - return 0; - } - if (n <= k) { - thrust::sequence(thrust_exec_policy, - thrust::device_pointer_cast(codes), - thrust::device_pointer_cast(codes + n)); - RAFT_CHECK_CUDA(stream); - thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1); - RAFT_CHECK_CUDA(stream); - - if (n < k) - 
RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); - RAFT_CUDA_TRY(cudaMemcpyAsync( - centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); - *residual_host = 0; - return 0; - } - - // Initialize cuBLAS - RAFT_CUBLAS_TRY(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // ------------------------------------------------------- - // k-means++ algorithm - // ------------------------------------------------------- - - // Choose initial cluster centroids - if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, work, seed)) - WARNING("could not initialize k-means centroids"); - - // Apply k-means iteration until convergence - for (iter = 0; iter < maxiter; ++iter) { - // Update cluster centroids - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) - WARNING("could not update k-means centroids"); - - // Determine centroid closest to each observation - residualPrev = *residual_host; - if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) - WARNING("could not assign observation vectors to k-means clusters"); - - // Reinitialize empty clusters with new centroids - index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); - - // FIXME: emptyCentroid never reaches k (infinite loop) under certain - // conditions, such as if obs is corrupt (as seen as a result of a - // DataFrame column of NULL edge vals used to create the Graph) - while (emptyCentroid < k) { - if (chooseNewCentroid( - handle, n, d, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) - WARNING("could not replace empty centroid"); - if (assignCentroids( - handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) - WARNING("could not assign observation vectors to k-means clusters"); - emptyCentroid = (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); - RAFT_CHECK_CUDA(stream); - } - - // Check for convergence - if (std::fabs(residualPrev - (*residual_host)) / n < tol) { - ++iter; - break; - } - } - - // Warning if k-means has failed to converge - if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); - - *iters_host = iter; - return 0; -} - -/** - * @brief Find clusters with k-means algorithm. - * Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param tol Tolerance for convergence. k-means stops when the - * change in residual divided by n is less than tol. - * @param maxiter Maximum number of k-means iterations. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param codes (Output, device memory, n entries) Cluster - * assignments. 
- * @param residual On exit, residual sum of squares (sum of squares - * of distances between observation vectors and centroids). - * @param iters on exit, number of k-means iterations. - * @param seed random seed to be used. - * @return error flag - */ -template -int kmeans(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - value_type_t tol, - index_type_t maxiter, - const value_type_t* __restrict__ obs, - index_type_t* __restrict__ codes, - value_type_t& residual, - index_type_t& iters, - unsigned long long seed = 123456) -{ - using namespace matrix; - - // Check that parameters are valid - RAFT_EXPECTS(n > 0, "invalid parameter (n<1)"); - RAFT_EXPECTS(d > 0, "invalid parameter (d<1)"); - RAFT_EXPECTS(k > 0, "invalid parameter (k<1)"); - RAFT_EXPECTS(tol > 0, "invalid parameter (tol<=0)"); - RAFT_EXPECTS(maxiter >= 0, "invalid parameter (maxiter<0)"); - - // Allocate memory - vector_t clusterSizes(handle, k); - vector_t centroids(handle, d * k); - vector_t work(handle, n * max(k, d)); - vector_t work_int(handle, 2 * d * n); - - // Perform k-means - return kmeans(handle, - n, - d, - k, - tol, - maxiter, - obs, - codes, - clusterSizes.raw(), - centroids.raw(), - work.raw(), - work_int.raw(), - &residual, - &iters, - seed); -} - -} // namespace raft diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index c61b5f1458..1f8857200f 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -16,44 +16,13 @@ #pragma once -#include -#include - -#include -#include -#include -#include - #include -#include -#include -#include - -#ifdef COLLECT_TIME_STATISTICS -#include -#include -#include -#include -#include -#endif - -#ifdef COLLECT_TIME_STATISTICS -static double timer(void) -{ - struct timeval tv; - cudaDeviceSynchronize(); - gettimeofday(&tv, NULL); - return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; -} -#endif +#include namespace raft { namespace spectral { -using namespace matrix; -using namespace linalg; - // ========================================================= // Spectral modularity_maximization // ========================================================= @@ -81,52 +50,15 @@ using namespace linalg; */ template std::tuple modularity_maximization( - handle_t const& handle, - sparse_matrix_t const& csr_m, - EigenSolver const& eigen_solver, - ClusterSolver const& cluster_solver, - vertex_t* __restrict__ clusters, - weight_t* eigVals, - weight_t* eigVecs) -{ - RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); - - std::tuple - stats; // # iters eigen solver, cluster solver residual, # iters cluster solver - - vertex_t n = csr_m.nrows_; - - // Compute eigenvectors of Modularity Matrix - - // Initialize Modularity Matrix - modularity_matrix_t B{handle, csr_m}; - - auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; - - // Compute eigenvectors corresponding to largest eigenvalues - std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); - - // Whiten eigenvector matrix - transform_eigen_matrix(handle, n, nEigVecs, eigVecs); - - // notice that at this point the matrix has already been transposed, so we are scaling - // columns - 
scale_obs(nEigVecs, n, eigVecs); - RAFT_CHECK_CUDA(stream); - - // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); - - std::get<1>(stats) = pair_cluster.first; - std::get<2>(stats) = pair_cluster.second; - - return stats; + handle_t const& handle, + sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) { + return detail::modularity_maximization(handle, csr_m, + eigen_solver, cluster_solver, clusters, eigVals, eigVecs); } //=================================================== // Analysis of graph partition @@ -144,42 +76,9 @@ void analyzeModularity(handle_t const& handle, sparse_matrix_t const& csr_m, vertex_t nClusters, vertex_t const* __restrict__ clusters, - weight_t& modularity) -{ - RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - - vertex_t i; - vertex_t n = csr_m.nrows_; - weight_t partModularity, clustersize; - - auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); - - // Device memory - vector_t part_i(handle, n); - vector_t Bx(handle, n); - - // Initialize cuBLAS - RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // Initialize Modularity - modularity_matrix_t B{handle, csr_m}; - - // Initialize output - modularity = 0; - - // Iterate through partitions - for (i = 0; i < nClusters; ++i) { - if (!construct_indicator(handle, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { - WARNING("empty partition"); - continue; - } - - // Record results - modularity += partModularity; - } - - modularity = modularity / B.diagonal_.nrm1(); + weight_t& modularity) { + detail::analyzeModularity(handle, csr_m, nClusters, clusters, modularity); +} } } // namespace spectral diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 5b1478baa9..ca2b71ae49 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -15,26 +15,13 @@ */ #pragma once -#include -#include - -#include -#include -#include -#include - #include -#include -#include -#include +#include namespace raft { namespace spectral { -using namespace matrix; -using namespace linalg; - // ========================================================= // Spectral partitioner // ========================================================= @@ -68,47 +55,10 @@ std::tuple partition(handle_t const& handle, ClusterSolver const& cluster_solver, vertex_t* __restrict__ clusters, weight_t* eigVals, - weight_t* eigVecs) -{ - RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); - - std::tuple - stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, - // cluster solver residual, # iters cluster solver - - vertex_t n = csr_m.nrows_; - - // ------------------------------------------------------- - // Spectral partitioner - // ------------------------------------------------------- - - // Compute eigenvectors of Laplacian - - // Initialize Laplacian - /// sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, csr_m}; - - auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; - - // Compute smallest eigenvalues and 
eigenvectors - std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); - - // Whiten eigenvector matrix - transform_eigen_matrix(handle, n, nEigVecs, eigVecs); - - // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); - - std::get<1>(stats) = pair_cluster.first; - std::get<2>(stats) = pair_cluster.second; - - return stats; + weight_t* eigVecs) { + return detail::partition(handle, csr_m, eigen_solver, + cluster_solver, clusters, + eigVals, eigVecs); } // ========================================================= @@ -135,45 +85,8 @@ void analyzePartition(handle_t const& handle, vertex_t nClusters, const vertex_t* __restrict__ clusters, weight_t& edgeCut, - weight_t& cost) -{ - RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - - vertex_t i; - vertex_t n = csr_m.nrows_; - - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); - - weight_t partEdgesCut, clustersize; - - // Device memory - vector_t part_i(handle, n); - vector_t Lx(handle, n); - - // Initialize cuBLAS - RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // Initialize Laplacian - /// sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, csr_m}; - - // Initialize output - cost = 0; - edgeCut = 0; - - // Iterate through partitions - for (i = 0; i < nClusters; ++i) { - // Construct indicator vector for ith partition - if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { - WARNING("empty partition"); - continue; - } - - // Record results - cost += partEdgesCut / clustersize; - edgeCut += partEdgesCut / 2; - } + weight_t& cost) { + detail::analyzePartition(handle, csr_m, nClusters, clusters, edgeCut, cost); } } // namespace spectral From cca8c839b57b63924b2566ffa31815e3be0500b0 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 1 Feb 2022 17:08:20 -0500 Subject: [PATCH 2/8] Updating necessary headers to remove device/host buffer from raft, update lap, spectral, cluster, and label packages --- cpp/include/raft/cluster/detail/kmeans.hpp | 1323 ++++++++--------- cpp/include/raft/cluster/kmeans.hpp | 21 +- cpp/include/raft/comms/helper.hpp | 1 - cpp/include/raft/comms/std_comms.hpp | 2 - cpp/include/raft/label/classlabels.hpp | 18 +- .../raft/label/detail/merge_labels.cuh | 2 + cpp/include/raft/label/merge_labels.hpp | 20 +- cpp/include/raft/lap/detail/lap_functions.cuh | 2 +- cpp/include/raft/lap/detail/lap_kernels.cuh | 1 - cpp/include/raft/linalg/lanczos.hpp | 6 +- cpp/include/raft/mr/buffer_base.hpp | 211 --- cpp/include/raft/mr/device/buffer.hpp | 70 - cpp/include/raft/mr/host/buffer.hpp | 85 -- .../raft/sparse/distance/detail/coo_spmv.cuh | 1 - .../raft/sparse/distance/detail/utils.cuh | 1 - cpp/include/raft/sparse/distance/distance.hpp | 1 - .../sparse/hierarchy/detail/agglomerative.cuh | 1 - .../hierarchy/detail/connectivities.cuh | 1 - .../raft/sparse/hierarchy/detail/mst.cuh | 1 - .../raft/sparse/linalg/detail/spectral.cuh | 2 + cpp/include/raft/sparse/op/detail/reduce.cuh | 1 - .../selection/detail/connect_components.cuh | 3 +- .../raft/sparse/selection/detail/knn.cuh | 1 - cpp/include/raft/spatial/knn/ann.hpp | 2 - cpp/include/raft/spatial/knn/knn.hpp | 4 - cpp/include/raft/spectral/cluster_solvers.hpp | 20 +- .../detail/modularity_maximization.hpp | 2 +- .../raft/spectral/detail/partition.hpp | 8 +- .../raft/spectral/modularity_maximization.hpp | 25 +- cpp/include/raft/spectral/partition.hpp | 13 +- cpp/test/CMakeLists.txt | 2 - cpp/test/cluster_solvers.cu | 10 +- cpp/test/eigen_solvers.cu | 17 +- cpp/test/label/label.cu | 2 +- cpp/test/label/merge_labels.cu | 2 +- cpp/test/lap/lap.cu | 2 +- cpp/test/mr/device/buffer.cpp | 92 -- cpp/test/mr/host/buffer.cpp | 71 - cpp/test/spectral_matrix.cu | 2 +- 39 files changed, 762 insertions(+), 1287 deletions(-) delete mode 100644 cpp/include/raft/mr/buffer_base.hpp delete mode 100644 cpp/include/raft/mr/device/buffer.hpp delete mode 100644 cpp/include/raft/mr/host/buffer.hpp delete mode 100644 cpp/test/mr/device/buffer.cpp delete mode 100644 cpp/test/mr/host/buffer.cpp diff --git a/cpp/include/raft/cluster/detail/kmeans.hpp b/cpp/include/raft/cluster/detail/kmeans.hpp index cf7255c22e..e5911bc7d9 100644 --- a/cpp/include/raft/cluster/detail/kmeans.hpp +++ b/cpp/include/raft/cluster/detail/kmeans.hpp @@ -32,8 +32,8 @@ #include #include #include -#include -#include +#include +#include namespace raft { namespace cluster { @@ -42,9 +42,9 @@ namespace detail { // Useful grid settings // ========================================================= - constexpr unsigned int BLOCK_SIZE = 1024; - constexpr unsigned int WARP_SIZE = 32; - constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); +constexpr unsigned int BLOCK_SIZE = 1024; +constexpr unsigned int WARP_SIZE = 32; +constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); // ========================================================= // CUDA kernels @@ -73,66 +73,66 @@ namespace detail { * centroid. Matrix dimensions are n x k. Entries must be * initialized to zero. 
*/ - template - static __global__ void computeDistances(index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists) - { - // Loop index - index_type_t i; - - // Block indices - index_type_t bidx; - // Global indices - index_type_t gidx, gidy, gidz; - - // Private memory - value_type_t centroid_private, dist_private; - - // Global x-index indicates index of vector entry - bidx = blockIdx.x; - while (bidx * blockDim.x < d) { - gidx = threadIdx.x + bidx * blockDim.x; - - // Global y-index indicates centroid - gidy = threadIdx.y + blockIdx.y * blockDim.y; - while (gidy < k) { - // Load centroid coordinate from global memory - centroid_private = (gidx < d) ? centroids[IDX(gidx, gidy, d)] : 0; - - // Global z-index indicates observation vector - gidz = threadIdx.z + blockIdx.z * blockDim.z; - while (gidz < n) { - // Load observation vector coordinate from global memory - dist_private = (gidx < d) ? obs[IDX(gidx, gidz, d)] : 0; - - // Compute contribution of current entry to distance - dist_private = centroid_private - dist_private; - dist_private = dist_private * dist_private; - - // Perform reduction on warp - for (i = WARP_SIZE / 2; i > 0; i /= 2) - dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); - - // Write result to global memory - if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private); - - // Move to another observation vector - gidz += blockDim.z * gridDim.z; - } - - // Move to another centroid - gidy += blockDim.y * gridDim.y; - } - - // Move to another vector entry - bidx += gridDim.x; - } +template +static __global__ void computeDistances(index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) +{ + // Loop index + index_type_t i; + + // Block indices + index_type_t bidx; + // Global indices + index_type_t gidx, gidy, gidz; + + // Private memory + value_type_t centroid_private, dist_private; + + // Global x-index indicates index of vector entry + bidx = blockIdx.x; + while (bidx * blockDim.x < d) { + gidx = threadIdx.x + bidx * blockDim.x; + + // Global y-index indicates centroid + gidy = threadIdx.y + blockIdx.y * blockDim.y; + while (gidy < k) { + // Load centroid coordinate from global memory + centroid_private = (gidx < d) ? centroids[IDX(gidx, gidy, d)] : 0; + + // Global z-index indicates observation vector + gidz = threadIdx.z + blockIdx.z * blockDim.z; + while (gidz < n) { + // Load observation vector coordinate from global memory + dist_private = (gidx < d) ? obs[IDX(gidx, gidz, d)] : 0; + + // Compute contribution of current entry to distance + dist_private = centroid_private - dist_private; + dist_private = dist_private * dist_private; + + // Perform reduction on warp + for (i = WARP_SIZE / 2; i > 0; i /= 2) + dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); + + // Write result to global memory + if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private); + + // Move to another observation vector + gidz += blockDim.z * gridDim.z; + } + + // Move to another centroid + gidy += blockDim.y * gridDim.y; } + // Move to another vector entry + bidx += gridDim.x; + } +} + /** * @brief Find closest centroid to observation vectors. * Block and grid dimensions should be 1-dimensional. 
Ideally the @@ -154,47 +154,47 @@ namespace detail { * @param clusterSizes (Output, k entries) Number of points in each * cluster. Entries must be initialized to zero. */ - template - static __global__ void minDistances(index_type_t n, - index_type_t k, - value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) - { - // Loop index - index_type_t i, j; - - // Current matrix entry - value_type_t dist_curr; - - // Smallest entry in row - value_type_t dist_min; - index_type_t code_min; - - // Each row in observation matrix is processed by a thread - i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - // Find minimum entry in row - code_min = 0; - dist_min = dists[IDX(i, 0, n)]; - for (j = 1; j < k; ++j) { - dist_curr = dists[IDX(i, j, n)]; - code_min = (dist_curr < dist_min) ? j : code_min; - dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; - } - - // Transfer result to global memory - dists[i] = dist_min; - codes[i] = code_min; - - // Increment cluster sizes - atomicAdd(clusterSizes + code_min, 1); - - // Move to another row - i += blockDim.x * gridDim.x; - } +template +static __global__ void minDistances(index_type_t n, + index_type_t k, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) +{ + // Loop index + index_type_t i, j; + + // Current matrix entry + value_type_t dist_curr; + + // Smallest entry in row + value_type_t dist_min; + index_type_t code_min; + + // Each row in observation matrix is processed by a thread + i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + // Find minimum entry in row + code_min = 0; + dist_min = dists[IDX(i, 0, n)]; + for (j = 1; j < k; ++j) { + dist_curr = dists[IDX(i, j, n)]; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; } + // Transfer result to global memory + dists[i] = dist_min; + codes[i] = code_min; + + // Increment cluster sizes + atomicAdd(clusterSizes + code_min, 1); + + // Move to another row + i += blockDim.x * gridDim.x; + } +} + /** * @brief Check if newly computed distances are smaller than old distances. * Block and grid dimensions should be 1-dimensional. Ideally the @@ -214,37 +214,37 @@ namespace detail { * centroid. * @param code_new Index associated with new centroid. 
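 *
 * Usage sketch (one possible launch, mirroring the k-means++ loop in
 * initializeCentroids below, where dists + n holds distances to the
 * newly chosen centroid i):
 * @code{.cpp}
 *   dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1};
 *   minDistances2<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, dists, dists + n, codes, i);
 * @endcode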
*/ - template - static __global__ void minDistances2(index_type_t n, - value_type_t* __restrict__ dists_old, - const value_type_t* __restrict__ dists_new, - index_type_t* __restrict__ codes_old, - index_type_t code_new) - { - // Loop index - index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; - - // Distances - value_type_t dist_old_private; - value_type_t dist_new_private; - - // Each row is processed by a thread - while (i < n) { - // Get old and new distances - dist_old_private = dists_old[i]; - dist_new_private = dists_new[i]; - - // Update if new distance is smaller than old distance - if (dist_new_private < dist_old_private) { - dists_old[i] = dist_new_private; - codes_old[i] = code_new; - } - - // Move to another row - i += blockDim.x * gridDim.x; - } +template +static __global__ void minDistances2(index_type_t n, + value_type_t* __restrict__ dists_old, + const value_type_t* __restrict__ dists_new, + index_type_t* __restrict__ codes_old, + index_type_t code_new) +{ + // Loop index + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; + + // Distances + value_type_t dist_old_private; + value_type_t dist_new_private; + + // Each row is processed by a thread + while (i < n) { + // Get old and new distances + dist_old_private = dists_old[i]; + dist_new_private = dists_new[i]; + + // Update if new distance is smaller than old distance + if (dist_new_private < dist_old_private) { + dists_old[i] = dist_new_private; + codes_old[i] = code_new; } + // Move to another row + i += blockDim.x * gridDim.x; + } +} + /** * @brief Compute size of k-means clusters. * Block and grid dimensions should be 1-dimensional. Ideally the @@ -256,17 +256,17 @@ namespace detail { * @param clusterSizes (Output, k entries) Number of points in each * cluster. Entries must be initialized to zero. */ - template - static __global__ void computeClusterSizes(index_type_t n, - const index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) - { - index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - atomicAdd(clusterSizes + codes[i], 1); - i += blockDim.x * gridDim.x; - } - } +template +static __global__ void computeClusterSizes(index_type_t n, + const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) +{ + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + atomicAdd(clusterSizes + codes[i], 1); + i += blockDim.x * gridDim.x; + } +} /** * @brief Divide rows of centroid matrix by cluster sizes. @@ -289,37 +289,37 @@ namespace detail { * cluster. On exit, the matrix is the centroid matrix (each * column is the mean position of a cluster). 
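 *
 * Launch sketch (mirrors the configuration used by updateCentroids later
 * in this file):
 * @code{.cpp}
 *   dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1};
 *   dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
 *                min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound),
 *                1};
 *   divideCentroids<<<gridDim, blockDim, 0, stream>>>(d, k, clusterSizes, centroids);
 * @endcode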
 */
- template
- static __global__ void divideCentroids(index_type_t d,
- index_type_t k,
- const index_type_t* __restrict__ clusterSizes,
- value_type_t* __restrict__ centroids)
- {
- // Global indices
- index_type_t gidx, gidy;
-
- // Current cluster size
- index_type_t clusterSize_private;
-
- // Observation vector is determined by global y-index
- gidy = threadIdx.y + blockIdx.y * blockDim.y;
- while (gidy < k) {
- // Get cluster size from global memory
- clusterSize_private = clusterSizes[gidy];
-
- // Add vector entries to centroid matrix
- // vector entris are determined by global x-index
- gidx = threadIdx.x + blockIdx.x * blockDim.x;
- while (gidx < d) {
- centroids[IDX(gidx, gidy, d)] /= clusterSize_private;
- gidx += blockDim.x * gridDim.x;
- }
-
- // Move to another centroid
- gidy += blockDim.y * gridDim.y;
- }
+template
+static __global__ void divideCentroids(index_type_t d,
+ index_type_t k,
+ const index_type_t* __restrict__ clusterSizes,
+ value_type_t* __restrict__ centroids)
+{
+ // Global indices
+ index_type_t gidx, gidy;
+
+ // Current cluster size
+ index_type_t clusterSize_private;
+
+ // Observation vector is determined by global y-index
+ gidy = threadIdx.y + blockIdx.y * blockDim.y;
+ while (gidy < k) {
+ // Get cluster size from global memory
+ clusterSize_private = clusterSizes[gidy];
+
+ // Add vector entries to centroid matrix
+ // vector entries are determined by global x-index
+ gidx = threadIdx.x + blockIdx.x * blockDim.x;
+ while (gidx < d) {
+ centroids[IDX(gidx, gidy, d)] /= clusterSize_private;
+ gidx += blockDim.x * gridDim.x;
 }
+ // Move to another centroid
+ gidy += blockDim.y * gridDim.y;
+ }
+}
+
 // =========================================================
 // Helper functions
 // =========================================================
@@ -344,78 +344,78 @@ namespace detail {
 * coordinates.
 * @return Zero if successful. Otherwise non-zero.
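 *
 * @note In exact k-means++ sampling, observation i would be chosen with
 * probability dists[i] / sum_j dists[j], i.e. proportionally to its squared
 * distance from the closest existing centroid. As the workaround comment in
 * the body below explains, this implementation instead inverts the
 * cumulative sum by linear interpolation,
 * obsIndex = (n - 1) * (rand * distsSum - distsCumSum[0]) / (distsSum - distsCumSum[0]),
 * which only approximates the exact inverse-CDF draw until the Thrust
 * lower_bound issue is resolved.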
*/ - template - static int chooseNewCentroid(handle_t const& handle, - index_type_t n, - index_type_t d, - value_type_t rand, - const value_type_t* __restrict__ obs, - value_type_t* __restrict__ dists, - value_type_t* __restrict__ centroid) - { - // Cumulative sum of distances - value_type_t* distsCumSum = dists + n; - // Residual sum of squares - value_type_t distsSum{0}; - // Observation vector that is chosen as new centroid - index_type_t obsIndex; - - auto stream = handle.get_stream(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Compute cumulative sum of distances - thrust::inclusive_scan(thrust_exec_policy, - thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n), - thrust::device_pointer_cast(distsCumSum)); - CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpyAsync( - &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); - - // Randomly choose observation vector - // Probabilities are proportional to square of distance to closest - // centroid (see k-means++ algorithm) - // - // seg-faults due to Thrust bug - // on binary-search-like algorithms - // when run with stream dependent - // execution policies; fixed on Thrust GitHub - // hence replace w/ linear interpolation, - // until the Thrust issue gets resolved: - // - // obsIndex = (thrust::lower_bound( - // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), - // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - - // thrust::device_pointer_cast(distsCumSum)); - // - // linear interpolation logic: - //{ - value_type_t minSum{0}; - RAFT_CUDA_TRY( - cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); - RAFT_CHECK_CUDA(stream); - - if (distsSum > minSum) { - value_type_t vIndex = static_cast(n - 1); - obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / (distsSum - minSum)); - } else { - obsIndex = 0; - } - //} - - RAFT_CHECK_CUDA(stream); - obsIndex = max(obsIndex, 0); - obsIndex = min(obsIndex, n - 1); - - // Record new centroid position - RAFT_CUDA_TRY(cudaMemcpyAsync(centroid, - obs + IDX(0, obsIndex, d), - d * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); - - return 0; - } +template +static int chooseNewCentroid(handle_t const& handle, + index_type_t n, + index_type_t d, + value_type_t rand, + const value_type_t* __restrict__ obs, + value_type_t* __restrict__ dists, + value_type_t* __restrict__ centroid) +{ + // Cumulative sum of distances + value_type_t* distsCumSum = dists + n; + // Residual sum of squares + value_type_t distsSum{0}; + // Observation vector that is chosen as new centroid + index_type_t obsIndex; + + auto stream = handle.get_stream(); + auto thrust_exec_policy = handle.get_thrust_policy(); + + // Compute cumulative sum of distances + thrust::inclusive_scan(thrust_exec_policy, + thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), + thrust::device_pointer_cast(distsCumSum)); + CHECK_CUDA(stream); + CUDA_TRY(cudaMemcpyAsync( + &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); + + // Randomly choose observation vector + // Probabilities are proportional to square of distance to closest + // centroid (see k-means++ algorithm) + // + // seg-faults due to Thrust bug + // on binary-search-like algorithms + // when run with stream dependent + // execution policies; fixed on Thrust GitHub + // hence replace w/ linear interpolation, + // until the Thrust issue gets resolved: + // + // obsIndex = 
(thrust::lower_bound( + // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), + // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - + // thrust::device_pointer_cast(distsCumSum)); + // + // linear interpolation logic: + //{ + value_type_t minSum{0}; + RAFT_CUDA_TRY( + cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); + RAFT_CHECK_CUDA(stream); + + if (distsSum > minSum) { + value_type_t vIndex = static_cast(n - 1); + obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / (distsSum - minSum)); + } else { + obsIndex = 0; + } + //} + + RAFT_CHECK_CUDA(stream); + obsIndex = max(obsIndex, 0); + obsIndex = min(obsIndex, n - 1); + + // Record new centroid position + RAFT_CUDA_TRY(cudaMemcpyAsync(centroid, + obs + IDX(0, obsIndex, d), + d * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + + return 0; +} /** * @brief Choose initial cluster centroids for k-means algorithm. @@ -441,90 +441,90 @@ namespace detail { * distance between observation vectors and the closest centroid. * @return Zero if successful. Otherwise non-zero. */ - template - static int initializeCentroids(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - value_type_t* __restrict__ centroids, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ dists, - unsigned long long seed) - { - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Loop index - index_type_t i; - - // Random number generator - thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution uniformDist(0, 1); - - auto stream = handle.get_stream(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - constexpr index_type_t grid_lower_bound{65535}; - - // ------------------------------------------------------- - // Implementation - // ------------------------------------------------------- - - // Initialize grid dimensions - dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; - - // CUDA grid dimensions - dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - 1, - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; - - // CUDA grid dimensions - dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1}; - - // Assign observation vectors to code 0 - RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - - // Choose first centroid - thrust::fill(thrust_exec_policy, - thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n), - 1); - RAFT_CHECK_CUDA(stream); - if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids)) - WARNING("error in k-means++ (could not pick centroid)"); - - // Compute distances from first centroid - RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, dists); - RAFT_CHECK_CUDA(stream); - - // Choose remaining centroids - for (i = 1; i < k; ++i) { - // Choose ith centroid - if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) - WARNING("error in k-means++ (could not pick centroid)"); - - // Compute distances from ith centroid - CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); - computeDistances<<>>( - n, d, 1, obs, centroids + IDX(0, i, d), dists + n); - 
RAFT_CHECK_CUDA(stream); - - // Recompute minimum distances - minDistances2<<>>(n, dists, dists + n, codes, i); - RAFT_CHECK_CUDA(stream); - } - - // Compute cluster sizes - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - computeClusterSizes<<>>(n, codes, clusterSizes); - RAFT_CHECK_CUDA(stream); - - return 0; - } +template +static int initializeCentroids(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + value_type_t* __restrict__ centroids, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ dists, + unsigned long long seed) +{ + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Loop index + index_type_t i; + + // Random number generator + thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution uniformDist(0, 1); + + auto stream = handle.get_stream(); + auto thrust_exec_policy = handle.get_thrust_policy(); + + constexpr index_type_t grid_lower_bound{65535}; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Initialize grid dimensions + dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; + + // CUDA grid dimensions + dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + 1, + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; + + // CUDA grid dimensions + dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1}; + + // Assign observation vectors to code 0 + RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); + + // Choose first centroid + thrust::fill(thrust_exec_policy, + thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), + 1); + RAFT_CHECK_CUDA(stream); + if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids)) + WARNING("error in k-means++ (could not pick centroid)"); + + // Compute distances from first centroid + RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); + computeDistances<<>>(n, d, 1, obs, centroids, dists); + RAFT_CHECK_CUDA(stream); + + // Choose remaining centroids + for (i = 1; i < k; ++i) { + // Choose ith centroid + if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) + WARNING("error in k-means++ (could not pick centroid)"); + + // Compute distances from ith centroid + CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); + computeDistances<<>>( + n, d, 1, obs, centroids + IDX(0, i, d), dists + n); + RAFT_CHECK_CUDA(stream); + + // Recompute minimum distances + minDistances2<<>>(n, dists, dists + n, codes, i); + RAFT_CHECK_CUDA(stream); + } + + // Compute cluster sizes + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + computeClusterSizes<<>>(n, codes, clusterSizes); + RAFT_CHECK_CUDA(stream); + + return 0; +} /** * @brief Find cluster centroids closest to observation vectors. @@ -552,53 +552,53 @@ namespace detail { * of squares of assignment. * @return Zero if successful. Otherwise non-zero. 
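 *
 * @note On exit, the first n entries of dists hold each observation's
 * squared distance to its nearest centroid, so the residual below is
 * computed as a single thrust::reduce over that prefix.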
*/ - template - static int assignCentroids(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, - value_type_t* residual_host) - { - auto stream = handle.get_stream(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Compute distance between centroids and observation vectors - RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); - - // CUDA grid dimensions - dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; - - dim3 gridDim; - constexpr index_type_t grid_lower_bound{65535}; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); - gridDim.y = min(k, grid_lower_bound); - gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); - - computeDistances<<>>(n, d, k, obs, centroids, dists); - RAFT_CHECK_CUDA(stream); - - // Find centroid closest to each observation vector - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - blockDim.x = BLOCK_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); - gridDim.y = 1; - gridDim.z = 1; - minDistances<<>>(n, k, dists, codes, clusterSizes); - CHECK_CUDA(stream); - - // Compute residual sum of squares - *residual_host = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); - - return 0; - } +template +static int assignCentroids(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) +{ + auto stream = handle.get_stream(); + auto thrust_exec_policy = handle.get_thrust_policy(); + + // Compute distance between centroids and observation vectors + RAFT_CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); + + // CUDA grid dimensions + dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; + + dim3 gridDim; + constexpr index_type_t grid_lower_bound{65535}; + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); + gridDim.y = min(k, grid_lower_bound); + gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); + + computeDistances<<>>(n, d, k, obs, centroids, dists); + RAFT_CHECK_CUDA(stream); + + // Find centroid closest to each observation vector + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, clusterSizes); + CHECK_CUDA(stream); + + // Compute residual sum of squares + *residual_host = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); + + return 0; +} /** * @brief Update cluster centroids for k-means algorithm. @@ -624,111 +624,107 @@ namespace detail { * Workspace. * @return Zero if successful. Otherwise non-zero. 
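 *
 * @note Sketch of the strategy used below: the observation matrix is
 * transposed, every matrix entry is tagged with the cluster of its
 * observation, entries are stably sorted by cluster, and
 * thrust::reduce_by_key then emits one sum per (cluster, dimension)
 * segment directly into the centroid matrix; divideCentroids finally
 * divides each sum by its cluster size to obtain the means. This avoids
 * atomics in the accumulation at the cost of O(d * n) workspace.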
*/ - template - static int updateCentroids(handle_t const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const index_type_t* __restrict__ codes, - const index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids, - value_type_t* __restrict__ work, - index_type_t* __restrict__ work_int) - { - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Useful constants - const value_type_t one = 1; - const value_type_t zero = 0; - - constexpr index_type_t grid_lower_bound{65535}; - - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Device memory - thrust::device_ptr obs_copy(work); - thrust::device_ptr codes_copy(work_int); - thrust::device_ptr rows(work_int + d * n); - - // Take transpose of observation matrix - RAFT_CUBLAS_TRY(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - d, - &one, - obs, - d, - &zero, - (value_type_t*)NULL, - n, - thrust::raw_pointer_cast(obs_copy), - n, - stream)); - - // Cluster assigned to each observation matrix entry - thrust::sequence(thrust_exec_policy, rows, rows + d * n); - RAFT_CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, - rows, - rows + d * n, - thrust::make_constant_iterator(n), - rows, - thrust::modulus()); - RAFT_CHECK_CUDA(stream); - thrust::gather( - thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); - RAFT_CHECK_CUDA(stream); - - // Row associated with each observation matrix entry - thrust::sequence(thrust_exec_policy, rows, rows + d * n); - RAFT_CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, - rows, - rows + d * n, - thrust::make_constant_iterator(n), - rows, - thrust::divides()); - RAFT_CHECK_CUDA(stream); - - // Sort and reduce to add observation vectors in same cluster - thrust::stable_sort_by_key(thrust_exec_policy, - codes_copy, - codes_copy + d * n, - make_zip_iterator(make_tuple(obs_copy, rows))); - RAFT_CHECK_CUDA(stream); - thrust::reduce_by_key(thrust_exec_policy, - rows, - rows + d * n, - obs_copy, - codes_copy, // Output to codes_copy is ignored - thrust::device_pointer_cast(centroids)); - RAFT_CHECK_CUDA(stream); - - // Divide sums by cluster size to get centroid matrix - // - // CUDA grid dimensions - dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; - - // CUDA grid dimensions - dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), - 1}; - - divideCentroids<<>>(d, k, clusterSizes, centroids); - RAFT_CHECK_CUDA(stream); - - return 0; - } - -} // namespace - -namespace raft { +template +static int updateCentroids(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const index_type_t* __restrict__ codes, + const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids, + value_type_t* __restrict__ work, + index_type_t* __restrict__ work_int) +{ + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Useful constants + const value_type_t one = 1; + const value_type_t zero = 0; + + constexpr index_type_t grid_lower_bound{65535}; + + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto 
thrust_exec_policy = handle.get_thrust_policy(); + + // Device memory + thrust::device_ptr obs_copy(work); + thrust::device_ptr codes_copy(work_int); + thrust::device_ptr rows(work_int + d * n); + + // Take transpose of observation matrix + RAFT_CUBLAS_TRY(linalg::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + d, + &one, + obs, + d, + &zero, + (value_type_t*)NULL, + n, + thrust::raw_pointer_cast(obs_copy), + n, + stream)); + + // Cluster assigned to each observation matrix entry + thrust::sequence(thrust_exec_policy, rows, rows + d * n); + RAFT_CHECK_CUDA(stream); + thrust::transform(thrust_exec_policy, + rows, + rows + d * n, + thrust::make_constant_iterator(n), + rows, + thrust::modulus()); + RAFT_CHECK_CUDA(stream); + thrust::gather( + thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); + RAFT_CHECK_CUDA(stream); + + // Row associated with each observation matrix entry + thrust::sequence(thrust_exec_policy, rows, rows + d * n); + RAFT_CHECK_CUDA(stream); + thrust::transform(thrust_exec_policy, + rows, + rows + d * n, + thrust::make_constant_iterator(n), + rows, + thrust::divides()); + RAFT_CHECK_CUDA(stream); + + // Sort and reduce to add observation vectors in same cluster + thrust::stable_sort_by_key(thrust_exec_policy, + codes_copy, + codes_copy + d * n, + make_zip_iterator(make_tuple(obs_copy, rows))); + RAFT_CHECK_CUDA(stream); + thrust::reduce_by_key(thrust_exec_policy, + rows, + rows + d * n, + obs_copy, + codes_copy, // Output to codes_copy is ignored + thrust::device_pointer_cast(centroids)); + RAFT_CHECK_CUDA(stream); + + // Divide sums by cluster size to get centroid matrix + // + // CUDA grid dimensions + dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; + + // CUDA grid dimensions + dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), + 1}; + + divideCentroids<<>>(d, k, clusterSizes, centroids); + RAFT_CHECK_CUDA(stream); + + return 0; +} // ========================================================= // k-means algorithm @@ -770,146 +766,146 @@ namespace raft { * @param seed random seed to be used. * @return error flag. 
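 *
 * @note Convergence is declared once the per-observation change in the
 * residual sum of squares satisfies |residualPrev - residual| / n < tol;
 * otherwise the loop exits at maxiter with a warning.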
*/ - template - int kmeans(handle_t const &handle, - index_type_t n, - index_type_t d, - index_type_t k, - value_type_t tol, - index_type_t maxiter, - const value_type_t *__restrict__ obs, - index_type_t *__restrict__ codes, - index_type_t *__restrict__ clusterSizes, - value_type_t *__restrict__ centroids, - value_type_t *__restrict__ work, - index_type_t *__restrict__ work_int, - value_type_t *residual_host, - index_type_t *iters_host, - unsigned long long seed) { - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Current iteration - index_type_t iter; - - constexpr - index_type_t grid_lower_bound{65535}; - - // Residual sum of squares at previous iteration - value_type_t residualPrev = 0; - - // Random number generator - thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution uniformDist(0, 1); - - // ------------------------------------------------------- - // Initialization - // ------------------------------------------------------- - - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); - auto thrust_exec_policy = handle.get_thrust_policy(); - - // Trivial cases - if (k == 1) { - CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - CUDA_TRY( - cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) - WARNING("could not compute k-means centroids"); - - dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; - - dim3 gridDim{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - 1, - min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)}; - - CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, work); - RAFT_CHECK_CUDA(stream); - *residual_host = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); - RAFT_CHECK_CUDA(stream); - return 0; - } - if (n <= k) { - thrust::sequence(thrust_exec_policy, - thrust::device_pointer_cast(codes), - thrust::device_pointer_cast(codes + n)); - RAFT_CHECK_CUDA(stream); - thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1); - RAFT_CHECK_CUDA(stream); - - if (n < k) - RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); - RAFT_CUDA_TRY(cudaMemcpyAsync( - centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); - *residual_host = 0; - return 0; - } - - // Initialize cuBLAS - RAFT_CUBLAS_TRY(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // ------------------------------------------------------- - // k-means++ algorithm - // ------------------------------------------------------- - - // Choose initial cluster centroids - if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, work, seed)) - WARNING("could not initialize k-means centroids"); - - // Apply k-means iteration until convergence - for (iter = 0; iter < maxiter; ++iter) { - // Update cluster centroids - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) - WARNING("could not update k-means centroids"); - - // Determine centroid closest to each observation - residualPrev = *residual_host; - if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, 
clusterSizes, residual_host)) - WARNING("could not assign observation vectors to k-means clusters"); - - // Reinitialize empty clusters with new centroids - index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); - - // FIXME: emptyCentroid never reaches k (infinite loop) under certain - // conditions, such as if obs is corrupt (as seen as a result of a - // DataFrame column of NULL edge vals used to create the Graph) - while (emptyCentroid < k) { - if (chooseNewCentroid( - handle, n, d, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) - WARNING("could not replace empty centroid"); - if (assignCentroids( - handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) - WARNING("could not assign observation vectors to k-means clusters"); - emptyCentroid = (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); - RAFT_CHECK_CUDA(stream); - } - - // Check for convergence - if (std::fabs(residualPrev - (*residual_host)) / n < tol) { - ++iter; - break; - } - } - - // Warning if k-means has failed to converge - if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); - - *iters_host = iter; - return 0; - } +template +int kmeans(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + value_type_t tol, + index_type_t maxiter, + const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids, + value_type_t* __restrict__ work, + index_type_t* __restrict__ work_int, + value_type_t* residual_host, + index_type_t* iters_host, + unsigned long long seed) +{ + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Current iteration + index_type_t iter; + + constexpr index_type_t grid_lower_bound{65535}; + + // Residual sum of squares at previous iteration + value_type_t residualPrev = 0; + + // Random number generator + thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution uniformDist(0, 1); + + // ------------------------------------------------------- + // Initialization + // ------------------------------------------------------- + + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); + + // Trivial cases + if (k == 1) { + CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); + CUDA_TRY( + cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + WARNING("could not compute k-means centroids"); + + dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; + + dim3 gridDim{ + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + 1, + min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)}; + + CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); + computeDistances<<>>(n, d, 1, obs, centroids, work); + RAFT_CHECK_CUDA(stream); + *residual_host = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + 
n)); + RAFT_CHECK_CUDA(stream); + return 0; + } + if (n <= k) { + thrust::sequence(thrust_exec_policy, + thrust::device_pointer_cast(codes), + thrust::device_pointer_cast(codes + n)); + RAFT_CHECK_CUDA(stream); + thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1); + RAFT_CHECK_CUDA(stream); + + if (n < k) + RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync( + centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + *residual_host = 0; + return 0; + } + + // Initialize cuBLAS + RAFT_CUBLAS_TRY(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // ------------------------------------------------------- + // k-means++ algorithm + // ------------------------------------------------------- + + // Choose initial cluster centroids + if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, work, seed)) + WARNING("could not initialize k-means centroids"); + + // Apply k-means iteration until convergence + for (iter = 0; iter < maxiter; ++iter) { + // Update cluster centroids + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + WARNING("could not update k-means centroids"); + + // Determine centroid closest to each observation + residualPrev = *residual_host; + if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + + // Reinitialize empty clusters with new centroids + index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); + + // FIXME: emptyCentroid never reaches k (infinite loop) under certain + // conditions, such as if obs is corrupt (as seen as a result of a + // DataFrame column of NULL edge vals used to create the Graph) + while (emptyCentroid < k) { + if (chooseNewCentroid( + handle, n, d, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) + WARNING("could not replace empty centroid"); + if (assignCentroids( + handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); + RAFT_CHECK_CUDA(stream); + } + + // Check for convergence + if (std::fabs(residualPrev - (*residual_host)) / n < tol) { + ++iter; + break; + } + } + + // Warning if k-means has failed to converge + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); + + *iters_host = iter; + return 0; +} /** * @brief Find clusters with k-means algorithm. @@ -936,50 +932,51 @@ namespace raft { * @param seed random seed to be used. 
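 *
 * @note This overload allocates the workspaces internally (clusterSizes:
 * k entries, centroids: d * k, work: n * max(k, d), work_int: 2 * d * n)
 * and forwards to the workspace-taking overload above.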
* @return error flag */ - template - int kmeans(handle_t const &handle, - index_type_t n, - index_type_t d, - index_type_t k, - value_type_t tol, - index_type_t maxiter, - const value_type_t *__restrict__ obs, - index_type_t *__restrict__ codes, - value_type_t &residual, - index_type_t &iters, - unsigned long long seed = 123456) { - using namespace matrix; - - // Check that parameters are valid - RAFT_EXPECTS(n > 0, "invalid parameter (n<1)"); - RAFT_EXPECTS(d > 0, "invalid parameter (d<1)"); - RAFT_EXPECTS(k > 0, "invalid parameter (k<1)"); - RAFT_EXPECTS(tol > 0, "invalid parameter (tol<=0)"); - RAFT_EXPECTS(maxiter >= 0, "invalid parameter (maxiter<0)"); - - // Allocate memory - vector_t clusterSizes(handle, k); - vector_t centroids(handle, d * k); - vector_t work(handle, n * max(k, d)); - vector_t work_int(handle, 2 * d * n); - - // Perform k-means - return kmeans(handle, - n, - d, - k, - tol, - maxiter, - obs, - codes, - clusterSizes.raw(), - centroids.raw(), - work.raw(), - work_int.raw(), - &residual, - &iters, - seed); - } +template +int kmeans(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + value_type_t tol, + index_type_t maxiter, + const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, + value_type_t& residual, + index_type_t& iters, + unsigned long long seed = 123456) +{ + using namespace matrix; + + // Check that parameters are valid + RAFT_EXPECTS(n > 0, "invalid parameter (n<1)"); + RAFT_EXPECTS(d > 0, "invalid parameter (d<1)"); + RAFT_EXPECTS(k > 0, "invalid parameter (k<1)"); + RAFT_EXPECTS(tol > 0, "invalid parameter (tol<=0)"); + RAFT_EXPECTS(maxiter >= 0, "invalid parameter (maxiter<0)"); + + // Allocate memory + vector_t clusterSizes(handle, k); + vector_t centroids(handle, d * k); + vector_t work(handle, n * max(k, d)); + vector_t work_int(handle, 2 * d * n); + + // Perform k-means + return kmeans(handle, + n, + d, + k, + tol, + maxiter, + obs, + codes, + clusterSizes.raw(), + centroids.raw(), + work.raw(), + work_int.raw(), + &residual, + &iters, + seed); +} } // namespace detail } // namespace cluster diff --git a/cpp/include/raft/cluster/kmeans.hpp b/cpp/include/raft/cluster/kmeans.hpp index fe4dd4793a..a6c824afe4 100644 --- a/cpp/include/raft/cluster/kmeans.hpp +++ b/cpp/include/raft/cluster/kmeans.hpp @@ -45,22 +45,21 @@ namespace cluster { * @param seed random seed to be used. 
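 *
 * A minimal usage sketch (names and sizes here are hypothetical; obs and
 * codes are assumed to be caller-allocated device buffers holding the
 * d * n column-major observations and the n output labels):
 * @code{.cpp}
 * raft::handle_t handle;
 * int n = 10000, d = 16, k = 8;
 * float residual{};
 * int iters{};
 * raft::cluster::kmeans(handle, n, d, k, 1e-4f, 100, obs, codes, residual, iters);
 * @endcode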
* @return error flag */ -template -int kmeans(handle_t const &handle, +template +int kmeans(handle_t const& handle, index_type_t n, index_type_t d, index_type_t k, value_type_t tol, index_type_t maxiter, - const value_type_t *__restrict__ obs, - index_type_t *__restrict__ codes, - value_type_t &residual, - index_type_t &iters, - unsigned long long seed = 123456) { - - return detail::kmeans(handle, n, d, k, tol, - maxiter, obs, codes, - residual, iters, seed); + const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, + value_type_t& residual, + index_type_t& iters, + unsigned long long seed = 123456) +{ + return detail::kmeans( + handle, n, d, k, tol, maxiter, obs, codes, residual, iters, seed); } } // namespace cluster } // namespace raft diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index 09a767bea7..d83e8f4d4f 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index f54535a88c..b4aa72d53e 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -21,8 +21,6 @@ #include #include -#include - #include #include #include diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp index a4d0cae359..f254c8f228 100644 --- a/cpp/include/raft/label/classlabels.hpp +++ b/cpp/include/raft/label/classlabels.hpp @@ -19,7 +19,7 @@ #include namespace raft { - namespace label { +namespace label { /** * Get unique class labels. @@ -36,8 +36,9 @@ namespace raft { * \param [in] stream cuda stream */ template -int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, cudaStream_t stream) { - detail::getUniqueLabels(unique, y, n, stream); +int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, cudaStream_t stream) +{ + return detail::getUniquelabels(unique, y, n, stream); } /** @@ -60,8 +61,9 @@ int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, */ template void getOvrlabels( - value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream) { - detail::getOvrLabels(y, n, y_unique, n_classes, y_out, idx, stream); + value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream) +{ + detail::getOvrlabels(y, n, y_unique, n_classes, y_out, idx, stream); } /** * Maps an input array containing a series of numbers into a new array @@ -84,9 +86,9 @@ void getOvrlabels( */ template void make_monotonic( - Type* out, Type* in, size_t N, cudaStream_t stream, Lambda filter_op, bool zero_based = false) + Type* out, Type* in, size_t N, cudaStream_t stream, Lambda filter_op, bool zero_based = false) { - detail::make_monotonic(out, in, N, stream, filter_op, zero_based); + detail::make_monotonic(out, in, N, stream, filter_op, zero_based); } /** @@ -109,7 +111,7 @@ void make_monotonic( template void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zero_based = false) { - detail::make_monotonic(out, in, N, stream, zero_based); + detail::make_monotonic(out, in, N, stream, zero_based); } }; // namespace label }; // end namespace raft diff --git a/cpp/include/raft/label/detail/merge_labels.cuh b/cpp/include/raft/label/detail/merge_labels.cuh index a3f2411102..3602a29e59 100644 --- a/cpp/include/raft/label/detail/merge_labels.cuh +++ b/cpp/include/raft/label/detail/merge_labels.cuh @@ 
-25,6 +25,7 @@ namespace raft { namespace label { +namespace detail { /** Note: this is one possible implementation where we represent the label * equivalence graph implicitly using labels_a, labels_b and mask. @@ -153,5 +154,6 @@ void merge_labels(value_idx* labels_a, RAFT_CUDA_TRY(cudaPeekAtLastError()); } +} // namespace detail }; // namespace label }; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/label/merge_labels.hpp b/cpp/include/raft/label/merge_labels.hpp index b21502a06a..5ba8fe8a27 100644 --- a/cpp/include/raft/label/merge_labels.hpp +++ b/cpp/include/raft/label/merge_labels.hpp @@ -20,7 +20,6 @@ namespace raft { namespace label { -namespace detail { /** * @brief Merge two labellings in-place, according to a core mask @@ -51,18 +50,17 @@ namespace detail { * @param[in] N Number of points in the dataset * @param[in] stream CUDA stream */ -template -void merge_labels(value_idx *labels_a, - const value_idx *labels_b, - const bool *mask, - value_idx *R, - bool *m, +template +void merge_labels(value_idx* labels_a, + const value_idx* labels_b, + const bool* mask, + value_idx* R, + bool* m, value_idx N, - cudaStream_t stream) { - - detail::merge_labels(labels_a, labels_b, mask, R, m, N, stream); + cudaStream_t stream) +{ + detail::merge_labels(labels_a, labels_b, mask, R, m, N, stream); } -}; // namespace detail }; // namespace label }; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/lap/detail/lap_functions.cuh b/cpp/include/raft/lap/detail/lap_functions.cuh index ab4aa2df59..6c6b09e5d8 100644 --- a/cpp/include/raft/lap/detail/lap_functions.cuh +++ b/cpp/include/raft/lap/detail/lap_functions.cuh @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/lap/detail/lap_kernels.cuh b/cpp/include/raft/lap/detail/lap_kernels.cuh index 328cbf3e74..b61d0bd269 100644 --- a/cpp/include/raft/lap/detail/lap_kernels.cuh +++ b/cpp/include/raft/lap/detail/lap_kernels.cuh @@ -28,7 +28,6 @@ #include #include -#include #include diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 9376994742..7b7aa0decf 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -28,9 +28,9 @@ #include #include #include -#include -#include -#include +#include +#include +#include namespace raft { diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp deleted file mode 100644 index 6998c1f186..0000000000 --- a/cpp/include/raft/mr/buffer_base.hpp +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -#include -#include -#include - -namespace raft { -namespace mr { - -/** - * @brief Base for all RAII-based owning of temporary memory allocations. This - * class should ideally not be used by users directly, but instead via - * the child classes `device_buffer` and `host_buffer`. 
- * - * @tparam T data type - * @tparam AllocatorT The underly allocator object - */ -template -class buffer_base { - public: - using size_type = std::size_t; - using value_type = T; - using iterator = value_type*; - using const_iterator = const value_type*; - using reference = T&; - using const_reference = const T&; - - buffer_base() = delete; - - buffer_base(const buffer_base& other) = delete; - - buffer_base& operator=(const buffer_base& other) = delete; - - /** - * @brief Main ctor - * - * @param[in] allocator asynchronous allocator used for managing buffer life - * @param[in] stream cuda stream where this allocation operations are async - * @param[in] n size of the buffer (in number of elements) - */ - buffer_base(std::shared_ptr allocator, cudaStream_t stream, size_type n = 0) - : data_(nullptr), size_(n), capacity_(n), stream_(stream), allocator_(std::move(allocator)) - { - if (capacity_ > 0) { - data_ = - static_cast(allocator_->allocate(capacity_ * sizeof(value_type), stream_)); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_)); - } - } - - ~buffer_base() { release(); } - - value_type* data() { return data_; } - - const value_type* data() const { return data_; } - - size_type size() const { return size_; } - - void clear() { size_ = 0; } - - iterator begin() { return data_; } - - const_iterator begin() const { return data_; } - - iterator end() { return data_ + size_; } - - const_iterator end() const { return data_ + size_; } - - /** - * @brief Reserve new memory size for this buffer. - * - * It re-allocates a fresh buffer if the new requested capacity is more than - * the current one, copies the old buffer contents to this new buffer and - * removes the old one. - * - * @param[in] new_capacity new capacity (in number of elements) - * @{ - */ - void reserve(size_type new_capacity) - { - if (new_capacity > capacity_) { - auto* new_data = - static_cast(allocator_->allocate(new_capacity * sizeof(value_type), stream_)); - if (size_ > 0) { raft::copy(new_data, data_, size_, stream_); } - // Only deallocate if we have allocated a pointer - if (nullptr != data_) { - allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); - } - data_ = new_data; - capacity_ = new_capacity; - } - } - - void reserve(size_type new_capacity, cudaStream_t stream) - { - set_stream(stream); - reserve(new_capacity); - } - /** @} */ - - /** - * @brief Resize the underlying buffer (uses `reserve` method internally) - * - * @param[in] new_size new buffer size - * @{ - */ - void resize(const size_type new_size) - { - reserve(new_size); - size_ = new_size; - } - - void resize(const size_type new_size, cudaStream_t stream) - { - set_stream(stream); - resize(new_size); - } - /** @} */ - - /** - * @brief Deletes the underlying buffer - * - * If this method is not explicitly called, it will be during the destructor - * @{ - */ - void release() - { - if (nullptr != data_) { - allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); - } - data_ = nullptr; - capacity_ = 0; - size_ = 0; - } - - void release(cudaStream_t stream) - { - set_stream(stream); - release(); - } - /** @} */ - - /** - * @brief returns the underlying allocator used - * - * @return the allocator pointer - */ - std::shared_ptr get_allocator() const { return allocator_; } - - /** - * @brief returns the underlying stream used - * - * @return the cuda stream - */ - cudaStream_t get_stream() const { return stream_; } - - protected: - value_type* data_; - - private: - size_type size_; - size_type capacity_; - cudaStream_t stream_; 
- std::shared_ptr allocator_; - - /** - * @brief Sets a new cuda stream where the future operations will be queued - * - * This method makes sure that the inter-stream dependencies are met and taken - * care of, before setting the input stream as a new stream for this buffer. - * Ideally, the same cuda stream passed during constructor is expected to be - * used throughout this buffer's lifetime, for performance. - * - * @param[in] stream new cuda stream to be set. If it is the same as the - * current one, then this method will be a no-op. - */ - void set_stream(cudaStream_t stream) - { - if (stream_ != stream) { - cudaEvent_t event; - RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - RAFT_CUDA_TRY(cudaEventRecord(event, stream_)); - RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, event, 0)); - stream_ = stream; - RAFT_CUDA_TRY(cudaEventDestroy(event)); - } - } -}; // class buffer_base - -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp deleted file mode 100644 index 9b5ff11c50..0000000000 --- a/cpp/include/raft/mr/device/buffer.hpp +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "allocator.hpp" -#include -#include - -namespace raft { -namespace mr { -namespace device { - -/** - * @brief RAII object owning a contiguous typed device buffer. The passed in - * allocator supports asynchronous allocation and deallocation so this - * can also be used for temporary memory - * - * @code{.cpp} - * template - * void foo(..., cudaStream_t stream) { - * ... - * raft::mr::device::buffer temp(stream, 0); - * ... - * temp.resize(n); - * kernelA<<>>(...,temp.data(),...); - * kernelB<<>>(...,temp.data(),...); - * temp.release(); - * ... - * } - * @endcode - */ -template -class buffer : public buffer_base { - public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; - using const_reference = typename buffer_base::const_reference; - - buffer() = delete; - - buffer(const buffer& other) = delete; - - buffer& operator=(const buffer& other) = delete; - - buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) - { - } -}; // class buffer - -}; // namespace device -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp deleted file mode 100644 index 204b384719..0000000000 --- a/cpp/include/raft/mr/host/buffer.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "allocator.hpp" -#include -#include -#include - -namespace raft { -namespace mr { -namespace host { - -/** - * @brief RAII object owning a contigous typed host buffer (aka pinned memory). - * The passed in allocator supports asynchronus allocation and - * deallocation so this can also be used for temporary memory - * - * @code{.cpp} - * template - * void foo(const T* in_d , T* out_d, ..., cudaStream_t stream) { - * ... - * raft::mr::host::buffer temp(stream, 0); - * ... - * temp.resize(n); - * raft::copy(temp.data(), in_d, temp.size()); - * ... - * raft::copy(out_d, temp.data(), temp.size()); - * temp.release(stream); - * ... - * } - * @endcode - */ -template -class buffer : public buffer_base { - public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; - using const_reference = typename buffer_base::const_reference; - - buffer() = delete; - - buffer(const buffer& other) = delete; - - buffer& operator=(const buffer& other) = delete; - - buffer(std::shared_ptr alloc, const device::buffer& other) - : buffer_base(alloc, other.get_stream(), other.size()) - { - if (other.size() > 0) { raft::copy(data_, other.data(), other.size(), other.get_stream()); } - } - - buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) - { - } - - reference operator[](size_type pos) { return data_[pos]; } - - const_reference operator[](size_type pos) const { return data_[pos]; } - - private: - using buffer_base::data_; -}; - -}; // namespace host -}; // namespace mr -}; // namespace raft diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh index c23a2b1537..cb9e67dd6b 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh @@ -21,7 +21,6 @@ #include #include -#include #include #include "../../csr.hpp" diff --git a/cpp/include/raft/sparse/distance/detail/utils.cuh b/cpp/include/raft/sparse/distance/detail/utils.cuh index 8c01b33c1e..06c034ad9f 100644 --- a/cpp/include/raft/sparse/distance/detail/utils.cuh +++ b/cpp/include/raft/sparse/distance/detail/utils.cuh @@ -17,7 +17,6 @@ #pragma once #include -#include #include diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp index 2f121dce33..aee53ac37a 100644 --- a/cpp/include/raft/sparse/distance/distance.hpp +++ b/cpp/include/raft/sparse/distance/distance.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 4e78494e6b..593702b4a7 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -19,7 +19,6 @@ #include #include #include 
-#include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index 5d4640f4a6..548ca00c93 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 7173c76c08..10e9d04c0d 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh index 2519c1914e..33e62d1374 100644 --- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh @@ -17,6 +17,8 @@ #include #include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/op/detail/reduce.cuh b/cpp/include/raft/sparse/op/detail/reduce.cuh index 074a139ba9..715bbe6deb 100644 --- a/cpp/include/raft/sparse/op/detail/reduce.cuh +++ b/cpp/include/raft/sparse/op/detail/reduce.cuh @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/connect_components.cuh b/cpp/include/raft/sparse/selection/detail/connect_components.cuh index 817b9782f2..5ef7371492 100644 --- a/cpp/include/raft/sparse/selection/detail/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/detail/connect_components.cuh @@ -17,9 +17,8 @@ #include #include -#include +#include #include -#include #include #include #include diff --git a/cpp/include/raft/sparse/selection/detail/knn.cuh b/cpp/include/raft/sparse/selection/detail/knn.cuh index de0a15c029..b8dc25053b 100644 --- a/cpp/include/raft/sparse/selection/detail/knn.cuh +++ b/cpp/include/raft/sparse/selection/detail/knn.cuh @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index 6ce9463e43..5f64a8d1b5 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -22,8 +22,6 @@ #include #include -#include - namespace raft { namespace spatial { namespace knn { diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index b29c4cc51c..59df75ba36 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -19,14 +19,10 @@ #include "detail/knn_brute_force_faiss.cuh" #include "detail/selection_faiss.cuh" -#include - namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; - /** * Performs a k-select across row partitioned index/distance * matrices formatted like the following: diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 092e51a17d..cc25e66cae 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -55,16 +55,16 @@ struct kmeans_solver_t { index_type_t iters{}; raft::cluster::kmeans(handle, - n_obs_vecs, - dim, - config_.n_clusters, - config_.tol, - config_.maxIter, - obs, - codes, - residual, - iters, - config_.seed); + n_obs_vecs, + dim, + config_.n_clusters, + config_.tol, + config_.maxIter, + obs, + codes, + 
residual,
+ iters,
+ config_.seed);
 return std::make_pair(residual, iters);
 }
diff --git a/cpp/include/raft/spectral/detail/modularity_maximization.hpp b/cpp/include/raft/spectral/detail/modularity_maximization.hpp
index ec0e23dfe0..08e104dea6 100644
--- a/cpp/include/raft/spectral/detail/modularity_maximization.hpp
+++ b/cpp/include/raft/spectral/detail/modularity_maximization.hpp
@@ -27,8 +27,8 @@
 #include
 #include
+#include
 #include
-#include
 #ifdef COLLECT_TIME_STATISTICS
 #include
diff --git a/cpp/include/raft/spectral/detail/partition.hpp b/cpp/include/raft/spectral/detail/partition.hpp
index 82313f84ee..99d6f3e419 100644
--- a/cpp/include/raft/spectral/detail/partition.hpp
+++ b/cpp/include/raft/spectral/detail/partition.hpp
@@ -26,8 +26,8 @@
 #include
 #include
-#include
 #include
+#include
 namespace raft {
 namespace spectral {
@@ -69,7 +69,8 @@ std::tuple partition(handle_t const& handle,
 ClusterSolver const& cluster_solver,
 vertex_t* __restrict__ clusters,
 weight_t* eigVals,
- weight_t* eigVecs) {
+ weight_t* eigVecs)
+{
 RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
 RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
 RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
@@ -135,7 +136,8 @@ void analyzePartition(handle_t const& handle,
 vertex_t nClusters,
 const vertex_t* __restrict__ clusters,
 weight_t& edgeCut,
- weight_t& cost) {
+ weight_t& cost)
+{
 RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
 vertex_t i;
diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp
index 1f8857200f..466851c74f 100644
--- a/cpp/include/raft/spectral/modularity_maximization.hpp
+++ b/cpp/include/raft/spectral/modularity_maximization.hpp
@@ -50,15 +50,16 @@ namespace spectral {
 */
 template
 std::tuple modularity_maximization(
- handle_t const& handle,
- sparse_matrix_t const& csr_m,
- EigenSolver const& eigen_solver,
- ClusterSolver const& cluster_solver,
- vertex_t* __restrict__ clusters,
- weight_t* eigVals,
- weight_t* eigVecs) {
- return detail::modularity_maximization(handle, csr_m,
- eigen_solver, cluster_solver, clusters, eigVals, eigVecs);
+ handle_t const& handle,
+ sparse_matrix_t const& csr_m,
+ EigenSolver const& eigen_solver,
+ ClusterSolver const& cluster_solver,
+ vertex_t* __restrict__ clusters,
+ weight_t* eigVals,
+ weight_t* eigVecs)
+{
+ return detail::modularity_maximization(
+ handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs);
 }
 //===================================================
 // Analysis of graph partition
@@ -76,9 +77,10 @@ void analyzeModularity(handle_t const& handle,
 sparse_matrix_t const& csr_m,
 vertex_t nClusters,
 vertex_t const* __restrict__ clusters,
- weight_t& modularity) {
- detail::analyzeModularity(handle, csr_m, nClusters, clusters, modularity);
-}
+ weight_t& modularity)
+{
+ detail::analyzeModularity(handle, csr_m, nClusters, clusters, modularity);
+}
 } // namespace spectral
diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp
index ca2b71ae49..597ef530a2 100644
--- a/cpp/include/raft/spectral/partition.hpp
+++ b/cpp/include/raft/spectral/partition.hpp
@@ -55,10 +55,10 @@ std::tuple partition(handle_t const& handle,
 ClusterSolver const& cluster_solver,
 vertex_t* __restrict__ clusters,
 weight_t* eigVals,
- weight_t* eigVecs) {
- return detail::partition(handle, csr_m, eigen_solver,
- cluster_solver, clusters,
- eigVals, eigVecs);
+ weight_t* eigVecs)
+{
+ return detail::partition(
+ handle, csr_m,
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 07f04ad2ab..ac216d0a2c 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -64,8 +64,6 @@ add_executable(test_raft
   test/matrix/math.cu
   test/matrix/matrix.cu
   test/matrix/linewise_op.cu
-  test/mr/device/buffer.cpp
-  test/mr/host/buffer.cpp
   test/mst.cu
   test/random/rng.cu
   test/random/rng_int.cu
diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu
index 2c7996514a..0030596e21 100644
--- a/cpp/test/cluster_solvers.cu
+++ b/cpp/test/cluster_solvers.cu
@@ -19,9 +19,11 @@
 #include
 #include
 
+#include
 #include
 
 namespace raft {
+namespace spectral {
 
 TEST(Raft, ClusterSolvers)
 {
@@ -60,7 +62,12 @@ TEST(Raft, ModularitySolvers)
   using value_type = double;
 
   handle_t h;
-  ASSERT_EQ(0, h.get_device());
+  ASSERT_EQ(0,
+            h.
+
+            get_device()
+
+  );
 
   index_type neigvs{10};
   index_type maxiter{100};
@@ -95,4 +102,5 @@
   EXPECT_ANY_THROW(spectral::analyzeModularity(h, sm, k, clusters, modularity));
 }
 
+}  // namespace spectral
 }  // namespace raft
diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu
index f898d11d2e..541d4dccc8 100644
--- a/cpp/test/eigen_solvers.cu
+++ b/cpp/test/eigen_solvers.cu
@@ -16,6 +16,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -25,6 +26,7 @@
 #include
 
 namespace raft {
+namespace spectral {
 
 TEST(Raft, EigenSolvers)
 {
@@ -34,7 +36,12 @@
   using value_type = double;
 
   handle_t h;
-  ASSERT_EQ(0, h.get_device());
+  ASSERT_EQ(0,
+            h.
+
+            get_device()
+
+  );
 
   index_type* ro{nullptr};
   index_type* ci{nullptr};
@@ -75,7 +82,12 @@ TEST(Raft, SpectralSolvers)
   using value_type = double;
 
   handle_t h;
-  ASSERT_EQ(0, h.get_device());
+  ASSERT_EQ(0,
+            h.
+
+            get_device()
+
+  );
 
   index_type neigvs{10};
   index_type maxiter{100};
@@ -109,4 +121,5 @@
   EXPECT_ANY_THROW(spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost));
 }
 
+}  // namespace spectral
 }  // namespace raft
diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu
index d441bf95a8..b19accc3b4 100644
--- a/cpp/test/label/label.cu
+++ b/cpp/test/label/label.cu
@@ -16,7 +16,7 @@
 
 #include
 
-#include
+#include
 #include "../test_utils.h"
 
 #include
diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu
index 5d30af795f..db6b34bbd6 100644
--- a/cpp/test/label/merge_labels.cu
+++ b/cpp/test/label/merge_labels.cu
@@ -15,7 +15,7 @@
  */
 
 #include
-#include
+#include
 #include "../test_utils.h"
 
 #include
diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu
index afdebae1f8..24e1c6be4f 100644
--- a/cpp/test/lap/lap.cu
+++ b/cpp/test/lap/lap.cu
@@ -28,7 +28,7 @@
 #include
 #include
 
-#include
+#include
 #include
 
 #define PROBLEMSIZE 1000  // Number of rows/columns
diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp
deleted file mode 100644
index 4861a4ca1f..0000000000
--- a/cpp/test/mr/device/buffer.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace raft {
-namespace mr {
-namespace device {
-
-TEST(Raft, DeviceBufferAlloc)
-{
-  cudaStream_t stream;
-  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-  // no allocation at construction
-  rmm::device_uvector buff(0, stream);
-  ASSERT_EQ(0, buff.size());
-  // explicit allocation after construction
-  buff.resize(20, stream);
-  ASSERT_EQ(20, buff.size());
-  // resizing to a smaller buffer size
-  buff.resize(10, stream);
-  ASSERT_EQ(10, buff.size());
-  // explicit deallocation
-  buff.release();
-  ASSERT_EQ(0, buff.size());
-  // use these methods without the explicit stream parameter
-  buff.resize(20, stream);
-  ASSERT_EQ(20, buff.size());
-  buff.resize(10, stream);
-  ASSERT_EQ(10, buff.size());
-  buff.release();
-  ASSERT_EQ(0, buff.size());
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-}
-
-TEST(Raft, DeviceBufferZeroResize)
-{
-  // Create a limiting_resource_adaptor to track allocations
-  auto curr_mr =
-    dynamic_cast(rmm::mr::get_current_device_resource());
-  auto limit_mr =
-    std::make_shared>(curr_mr, 1000);
-
-  rmm::mr::set_current_device_resource(limit_mr.get());
-
-  cudaStream_t stream;
-  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-  // no allocation at construction
-  rmm::device_uvector buff(10, stream);
-  ASSERT_EQ(10, buff.size());
-  // explicit allocation after construction
-  buff.resize(0, stream);
-  ASSERT_EQ(0, buff.size());
-  // resizing to a smaller buffer size
-  buff.resize(20, stream);
-  ASSERT_EQ(20, buff.size());
-  // explicit deallocation
-  buff.release();
-  ASSERT_EQ(0, buff.size());
-
-  // Now check that there is no memory left. (Used to not be true)
-  ASSERT_EQ(0, limit_mr->get_allocated_bytes());
-
-  rmm::mr::set_current_device_resource(curr_mr);
-
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-}
-
-}  // namespace device
-}  // namespace mr
-}  // namespace raft
diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/test/mr/host/buffer.cpp
deleted file mode 100644
index d645ffa0e0..0000000000
--- a/cpp/test/mr/host/buffer.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-#include
-
-namespace raft {
-namespace mr {
-namespace host {
-
-TEST(Raft, HostBuffer)
-{
-  auto alloc = std::make_shared();
-  cudaStream_t stream;
-  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-  // no allocation at construction
-  buffer buff(alloc, stream);
-  ASSERT_EQ(0, buff.size());
-  // explicit allocation after construction
-  buff.resize(20, stream);
-  ASSERT_EQ(20, buff.size());
-  // resizing to a smaller buffer size
-  buff.resize(10, stream);
-  ASSERT_EQ(10, buff.size());
-  // explicit deallocation
-  buff.release(stream);
-  ASSERT_EQ(0, buff.size());
-  // use these methods without the explicit stream parameter
-  buff.resize(20);
-  ASSERT_EQ(20, buff.size());
-  buff.resize(10);
-  ASSERT_EQ(10, buff.size());
-  buff.release();
-  ASSERT_EQ(0, buff.size());
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-}
-
-TEST(Raft, DeviceToHostBuffer)
-{
-  auto d_alloc = std::make_shared();
-  auto h_alloc = std::make_shared();
-  cudaStream_t stream;
-  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-  device::buffer d_buff(d_alloc, stream, 32);
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream));
-  buffer h_buff(h_alloc, d_buff);
-  ASSERT_EQ(d_buff.size(), h_buff.size());
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-}
-
-}  // namespace host
-}  // namespace mr
-}  // namespace raft
diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu
index fa54b04cda..6a5a2c9f59 100644
--- a/cpp/test/spectral_matrix.cu
+++ b/cpp/test/spectral_matrix.cu
@@ -19,7 +19,7 @@
 #include
 #include
 
-#include
+#include
 
 namespace raft {
 namespace {
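Both deleted test files exercised the raft::mr buffer wrappers this series retires; the device-side test had already been ported to the rmm API it duplicated. The surviving pattern, as a short sketch lifted from the deleted test (the header paths are assumptions):

    #include <raft/cudart_utils.h>
    #include <rmm/device_uvector.hpp>

    // rmm::device_uvector allocates from the current device resource and is
    // stream-ordered, so no explicit allocator object is passed around.
    cudaStream_t stream;
    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
    rmm::device_uvector<char> buff(0, stream);  // no allocation at construction
    buff.resize(20, stream);                    // stream-ordered allocation
    buff.release();                             // explicit deallocation
    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
    RAFT_CUDA_TRY(cudaStreamDestroy(stream));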
Nolet" Date: Tue, 1 Feb 2022 18:47:22 -0500 Subject: [PATCH 3/8] Updates --- .../cluster/detail/{kmeans.hpp => kmeans.cuh} | 2 +- cpp/include/raft/cluster/kmeans.hpp | 2 +- cpp/include/raft/linalg/lanczos.hpp | 2 +- .../raft/sparse/linalg/detail/spectral.cuh | 5 ++- ...atrix_wrappers.hpp => matrix_wrappers.cuh} | 0 .../detail/modularity_maximization.hpp | 2 +- .../raft/spectral/detail/partition.hpp | 2 +- .../{spectral_util.hpp => spectral_util.cuh} | 40 ++++++++++--------- cpp/test/spectral_matrix.cu | 2 +- 9 files changed, 30 insertions(+), 27 deletions(-) rename cpp/include/raft/cluster/detail/{kmeans.hpp => kmeans.cuh} (99%) rename cpp/include/raft/spectral/detail/{matrix_wrappers.hpp => matrix_wrappers.cuh} (100%) rename cpp/include/raft/spectral/detail/{spectral_util.hpp => spectral_util.cuh} (86%) diff --git a/cpp/include/raft/cluster/detail/kmeans.hpp b/cpp/include/raft/cluster/detail/kmeans.cuh similarity index 99% rename from cpp/include/raft/cluster/detail/kmeans.hpp rename to cpp/include/raft/cluster/detail/kmeans.cuh index e5911bc7d9..009d1a0642 100644 --- a/cpp/include/raft/cluster/detail/kmeans.hpp +++ b/cpp/include/raft/cluster/detail/kmeans.cuh @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/cluster/kmeans.hpp b/cpp/include/raft/cluster/kmeans.hpp index a6c824afe4..ab0fbb04c7 100644 --- a/cpp/include/raft/cluster/kmeans.hpp +++ b/cpp/include/raft/cluster/kmeans.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include +#include namespace raft { namespace cluster { diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 7b7aa0decf..b8da623418 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh index 33e62d1374..956ecb53a4 100644 --- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh @@ -71,11 +71,12 @@ void fit_embedding(const raft::handle_t& handle, value_type tol = 0.01; index_type restart_iter = 15 + neigvs; // what cugraph is using - raft::eigen_solver_config_t cfg{neigvs, maxiter, restart_iter, tol}; + raft::spectral::eigen_solver_config_t cfg{ + neigvs, maxiter, restart_iter, tol}; cfg.seed = seed; - raft::lanczos_solver_t eig_solver{cfg}; + raft::spectral::lanczos_solver_t eig_solver{cfg}; // cluster computation here is irrelevant, // hence define a no-op such solver to diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.cuh similarity index 100% rename from cpp/include/raft/spectral/detail/matrix_wrappers.hpp rename to cpp/include/raft/spectral/detail/matrix_wrappers.cuh diff --git a/cpp/include/raft/spectral/detail/modularity_maximization.hpp b/cpp/include/raft/spectral/detail/modularity_maximization.hpp index 08e104dea6..a55dfbe67f 100644 --- a/cpp/include/raft/spectral/detail/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/detail/modularity_maximization.hpp @@ -27,7 +27,7 @@ #include #include -#include +#include #include #ifdef COLLECT_TIME_STATISTICS diff --git a/cpp/include/raft/spectral/detail/partition.hpp b/cpp/include/raft/spectral/detail/partition.hpp index 99d6f3e419..b7c811d5a5 100644 --- a/cpp/include/raft/spectral/detail/partition.hpp +++ 
@@ -26,7 +26,7 @@
 #include
 #include
 
-#include
+#include
 #include
 
 namespace raft {
diff --git a/cpp/include/raft/spectral/detail/spectral_util.hpp b/cpp/include/raft/spectral/detail/spectral_util.cuh
similarity index 86%
rename from cpp/include/raft/spectral/detail/spectral_util.hpp
rename to cpp/include/raft/spectral/detail/spectral_util.cuh
index a30906de10..df26676f99 100644
--- a/cpp/include/raft/spectral/detail/spectral_util.hpp
+++ b/cpp/include/raft/spectral/detail/spectral_util.cuh
@@ -132,7 +132,7 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs,
                      thrust::minus());
     RAFT_CHECK_CUDA(stream);
 
-    RAFT_CUBLAS_TRY(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
+    RAFT_CUBLAS_TRY(raft::linalg::cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
 
     std /= std::sqrt(static_cast(n));
 
@@ -149,22 +149,22 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs,
   // TODO: in-place transpose
   {
     vector_t work(handle, nEigVecs * n);
-    RAFT_CUBLAS_TRY(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
-
-    RAFT_CUBLAS_TRY(cublasgeam(cublas_h,
-                               CUBLAS_OP_T,
-                               CUBLAS_OP_N,
-                               nEigVecs,
-                               n,
-                               &one,
-                               eigVecs,
-                               n,
-                               &zero,
-                               (weight_t*)NULL,
-                               nEigVecs,
-                               work.raw(),
-                               nEigVecs,
-                               stream));
+    RAFT_CUBLAS_TRY(raft::linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+
+    RAFT_CUBLAS_TRY(raft::linalg::cublasgeam(cublas_h,
+                                             CUBLAS_OP_T,
+                                             CUBLAS_OP_N,
+                                             nEigVecs,
+                                             n,
+                                             &one,
+                                             eigVecs,
+                                             n,
+                                             &zero,
+                                             (weight_t*)NULL,
+                                             nEigVecs,
+                                             work.raw(),
+                                             nEigVecs,
+                                             stream));
 
     RAFT_CUDA_TRY(cudaMemcpyAsync(
       eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream));
@@ -216,14 +216,16 @@ bool construct_indicator(handle_t const& handle,
   RAFT_CHECK_CUDA(stream);
 
   // Compute size of ith partition
-  RAFT_CUBLAS_TRY(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream));
+  RAFT_CUBLAS_TRY(
+    raft::linalg::cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream));
 
   clustersize = round(clustersize);
   if (clustersize < 0.5) { return false; }
 
   // Compute part stats
   B.mv(1, part_i.raw(), 0, Bx.raw());
-  RAFT_CUBLAS_TRY(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream));
+  RAFT_CUBLAS_TRY(
+    raft::linalg::cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream));
 
   return true;
 }
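The cublasgeam calls being qualified with raft::linalg:: above are cuBLAS's standard out-of-place transpose idiom: geam computes C = alpha*op(A) + beta*op(B), so op(A) = transpose with beta = 0 writes A^T into C, and B may be NULL. A standalone sketch against the raw cuBLAS API (handle and pointer names are illustrative):

    // Transpose d_in (d x n, column-major) into d_out (n x d).
    const float one = 1.0f, zero = 0.0f;
    cublasSgeam(cublas_h,
                CUBLAS_OP_T,    // op(A) = A^T
                CUBLAS_OP_N,    // B unused since beta == 0
                n, d,           // rows, cols of the result C
                &one, d_in, d,  // A: d x n, leading dimension d
                &zero, nullptr, n,
                d_out, n);      // C: n x d, leading dimension n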
diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu
index 6a5a2c9f59..652aa61451 100644
--- a/cpp/test/spectral_matrix.cu
+++ b/cpp/test/spectral_matrix.cu
@@ -19,7 +19,7 @@
 #include
 #include
 
-#include
+#include
 
 namespace raft {
 namespace {

From 71710b9ee12c0233259a4bdee0610d3629b2b5c5 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet"
Date: Tue, 1 Feb 2022 19:55:12 -0500
Subject: [PATCH 4/8] Fixing classlabels doc

---
 cpp/include/raft/label/classlabels.hpp | 27 +++++++++++++-------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp
index f254c8f228..8cd394facc 100644
--- a/cpp/include/raft/label/classlabels.hpp
+++ b/cpp/include/raft/label/classlabels.hpp
@@ -27,13 +27,12 @@ namespace label {
  * The y array is assumed to store class labels. The unique values are selected
  * from this array.
  *
- * \tparam value_t numeric type of the arrays with class labels
- * \param [in] y device array of labels, size [n]
- * \param [in] n number of labels
- * \param [out] unique device array of unique labels, unallocated on entry,
- *              on exit it has size [n_unique]
- * \param [out] n_unique number of unique labels
- * \param [in] stream cuda stream
+ * @tparam value_t numeric type of the arrays with class labels
+ * @param [in] y device array of labels, size [n]
+ * @param [in] n number of labels
+ * @param [in] stream cuda stream
+ * @returns unique device array of unique labels, unallocated on entry,
+ *          on exit it has size
  */
 template
 int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, cudaStream_t stream)
@@ -51,13 +50,13 @@ int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, cudaStre
  * free to choose other type for y_out (it should represent +/-1, and it is used
  * in floating point arithmetics).
  *
- * \param [in] y device array if input labels, size [n]
- * \param [in] n number of labels
- * \param [in] y_unique device array of unique labels, size [n_classes]
- * \param [in] n_classes number of unique labels
- * \param [out] y_out device array of output labels
- * \param [in] idx index of unique label that should be labeled as 1
- * \param [in] stream cuda stream
+ * @param [in] y device array if input labels, size [n]
+ * @param [in] n number of labels
+ * @param [in] y_unique device array of unique labels, size [n_classes]
+ * @param [in] n_classes number of unique labels
+ * @param [out] y_out device array of output labels
+ * @param [in] idx index of unique label that should be labeled as 1
+ * @param [in] stream cuda stream
  */
 template
 void getOvrlabels(

From 15943918dc31c56e49e98a6156958bad59023f2e Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet"
Date: Fri, 4 Feb 2022 18:56:11 -0500
Subject: [PATCH 5/8] Fixing doc

---
 cpp/include/raft/label/classlabels.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp
index 8cd394facc..cc7afd9780 100644
--- a/cpp/include/raft/label/classlabels.hpp
+++ b/cpp/include/raft/label/classlabels.hpp
@@ -30,6 +30,7 @@ namespace label {
  * @tparam value_t numeric type of the arrays with class labels
  * @param [in] y device array of labels, size [n]
  * @param [in] n number of labels
+ * @param [inout] unique output unique labels
  * @param [in] stream cuda stream
  * @returns unique device array of unique labels, unallocated on entry,
  *          on exit it has size

From 0f1d128ed077c29267a268a1278788ec118cb48c Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet"
Date: Fri, 4 Feb 2022 18:56:27 -0500
Subject: [PATCH 6/8] Moving unique where it belongs

---
 cpp/include/raft/label/classlabels.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp
index cc7afd9780..54e00e1f4f 100644
--- a/cpp/include/raft/label/classlabels.hpp
+++ b/cpp/include/raft/label/classlabels.hpp
@@ -28,9 +28,9 @@ namespace label {
  * from this array.
  *
  * @tparam value_t numeric type of the arrays with class labels
+ * @param [inout] unique output unique labels
  * @param [in] y device array of labels, size [n]
  * @param [in] n number of labels
- * @param [inout] unique output unique labels
  * @param [in] stream cuda stream
  * @returns unique device array of unique labels, unallocated on entry,
  *          on exit it has size
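For reference, a small usage sketch of the getUniquelabels() signature shown in the hunks above (variable names are illustrative); the return value is the count of distinct labels and the vector is grown to hold them:

    // y: device array of n labels, possibly with repeats.
    rmm::device_uvector<float> unique(0, stream);  // unallocated on entry
    int n_unique = raft::label::getUniquelabels(unique, y, n, stream);
    // unique now holds the n_unique distinct values found in y.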
Nolet" Date: Mon, 7 Feb 2022 14:11:46 -0500 Subject: [PATCH 7/8] Adding missing doxygen argument --- cpp/include/raft/label/classlabels.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp index 54e00e1f4f..de9f60518d 100644 --- a/cpp/include/raft/label/classlabels.hpp +++ b/cpp/include/raft/label/classlabels.hpp @@ -77,12 +77,13 @@ void getOvrlabels( * @tparam Type the numeric type of the input and output arrays * @tparam Lambda the type of an optional filter function, which determines * which items in the array to map. - * @param out the output monotonic array - * @param in input label array - * @param N number of elements in the input array - * @param stream cuda stream to use - * @param filter_op an optional function for specifying which values + * @param[out] out the output monotonic array + * @param[in] in input label array + * @param[in] N number of elements in the input array + * @param[in] stream cuda stream to use + * @param[in] filter_op an optional function for specifying which values * should have monotonically increasing labels applied to them. + * @param[in] zero_based force monotonic set to start at 0? */ template void make_monotonic( @@ -101,12 +102,11 @@ void make_monotonic( * where a set of vertices need to be labeled in a monotonically increasing * order. * @tparam Type the numeric type of the input and output arrays - * @tparam Lambda the type of an optional filter function, which determines - * which items in the array to map. - * @param out output label array with labels assigned monotonically - * @param in input label array - * @param N number of elements in the input array - * @param stream cuda stream to use + * @param[out] out output label array with labels assigned monotonically + * @param[in] in input label array + * @param[in] N number of elements in the input array + * @param[in] stream cuda stream to use + * @param[in] zero_based force monotonic label set to start at 0? */ template void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zero_based = false) From 36948fd552adac2faba427c09fe8cd9d54e312c5 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 8 Feb 2022 18:32:10 -0500 Subject: [PATCH 8/8] Updating w/ branch-22.04 --- cpp/include/raft/cluster/detail/kmeans.cuh | 19 +------------------ cpp/include/raft/linalg/detail/lanczos.hpp | 6 +++--- 2 files changed, 4 insertions(+), 21 deletions(-) diff --git a/cpp/include/raft/cluster/detail/kmeans.cuh b/cpp/include/raft/cluster/detail/kmeans.cuh index 79337011bf..5f1a0e137d 100644 --- a/cpp/include/raft/cluster/detail/kmeans.cuh +++ b/cpp/include/raft/cluster/detail/kmeans.cuh @@ -31,9 +31,9 @@ #include #include #include +#include #include #include -#include namespace raft { namespace cluster { @@ -656,22 +656,6 @@ static int updateCentroids(handle_t const& handle, thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix -<<<<<<< HEAD:cpp/include/raft/cluster/detail/kmeans.cuh - RAFT_CUBLAS_TRY(linalg::cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - d, - &one, - obs, - d, - &zero, - (value_type_t*)NULL, - n, - thrust::raw_pointer_cast(obs_copy), - n, - stream)); -======= // #TODO: Call from public API when ready RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h, CUBLAS_OP_T, @@ -687,7 +671,6 @@ static int updateCentroids(handle_t const& handle, thrust::raw_pointer_cast(obs_copy), n, stream)); ->>>>>>> rapidsai/branch-22.04:cpp/include/raft/spectral/kmeans.hpp // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp b/cpp/include/raft/linalg/detail/lanczos.hpp index 3d8fde7e68..a2b7751a05 100644 --- a/cpp/include/raft/linalg/detail/lanczos.hpp +++ b/cpp/include/raft/linalg/detail/lanczos.hpp @@ -28,9 +28,9 @@ #include "cublas_wrappers.hpp" #include #include -#include -#include -#include +#include +#include +#include namespace raft {