diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml
deleted file mode 100644
index 8b65da69aa..0000000000
--- a/.github/workflows/stale.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-name: Mark inactive issues and pull requests
-
-on:
-  schedule:
-    - cron: "0 * * * *"
-
-jobs:
-  mark-inactive-30d:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Mark 30 day inactive issues and pull requests
-        uses: actions/stale@v3
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-          stale-issue-message: >
-            This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days.
-            Please close this issue if no further response or action is needed.
-            Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
-            This issue will be labeled `inactive-90d` if there is no activity in the next 60 days.
-          stale-issue-label: "inactive-30d"
-          exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue"
-          days-before-issue-stale: 30
-          days-before-issue-close: -1
-          stale-pr-message: >
-            This PR has been labeled `inactive-30d` due to no recent activity in the past 30 days.
-            Please close this PR if it is no longer required.
-            Otherwise, please respond with a comment indicating any updates.
-            This PR will be labeled `inactive-90d` if there is no activity in the next 60 days.
-          stale-pr-label: "inactive-30d"
-          exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue"
-          days-before-pr-stale: 30
-          days-before-pr-close: -1
-          operations-per-run: 50
-  mark-inactive-90d:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Mark 90 day inactive issues and pull requests
-        uses: actions/stale@v3
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-          stale-issue-message: >
-            This issue has been labeled `inactive-90d` due to no recent activity in the past 90 days.
-            Please close this issue if no further response or action is needed.
-            Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
-          stale-issue-label: "inactive-90d"
-          exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue"
-          days-before-issue-stale: 90
-          days-before-issue-close: -1
-          stale-pr-message: >
-            This PR has been labeled `inactive-90d` due to no recent activity in the past 90 days.
-            Please close this PR if it is no longer required.
-            Otherwise, please respond with a comment indicating any updates.
-          stale-pr-label: "inactive-90d"
-          exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue"
-          days-before-pr-stale: 90
-          days-before-pr-close: -1
-          operations-per-run: 50
diff --git a/README.md b/README.md
index 2c0231f37e..79ab874c27 100755
--- a/README.md
+++ b/README.md
@@ -12,19 +12,19 @@ While not exhaustive, the following general categories help summarize the accele
 | Category | Examples |
 | --- | --- |
 | **Data Formats** | sparse & dense, conversions, data generation |
-| **Dense Linear Algebra** | matrix arithmetic, norms, factorization, least squares, svd & eigenvalue problems |
+| **Dense Operations** | linear algebra, matrix and vector operations, slicing, norms, factorization, least squares, svd & eigenvalue problems |
+| **Sparse Operations** | linear algebra, eigenvalue problems, slicing, symmetrization, components & labeling |
 | **Spatial** | pairwise distances, nearest neighbors, neighborhood graph construction |
-| **Sparse Operations** | linear algebra, eigenvalue problems, slicing, symmetrization, labeling |
 | **Basic Clustering** | spectral clustering, hierarchical clustering, k-means |
 | **Solvers** | combinatorial optimization, iterative solvers |
 | **Statistics** | sampling, moments and summary statistics, metrics |
-| **Distributed Tools** | multi-node multi-gpu infrastructure |
+| **Tools & Utilities** | common utilities for developing CUDA applications, multi-node multi-gpu infrastructure |
 
 RAFT provides a header-only C++ library and pre-compiled shared libraries that can 1) speed up compile times and 2) enable the APIs to be used without CUDA-enabled compilers.
 
-RAFT also provides 2 Python libraries:
-- `pylibraft` - low-level Python wrappers around RAFT algorithms and primitives.
-- `raft-dask` - reusable infrastructure for building analytics, including tools for building both single-GPU and multi-node multi-GPU algorithms.
+In addition to the C++ library, RAFT also provides 2 Python libraries:
+- `pylibraft` - lightweight low-level Python wrappers around RAFT algorithms and primitives.
+- `raft-dask` - multi-node multi-GPU communicator infrastructure for building distributed algorithms on the GPU with Dask.
 
 ## Getting started
 
@@ -78,9 +78,9 @@ raft::distance::pairwise_distance(handle, input.view(), input.view(), output.vie
 
 ### Python Example
 
-The `pylibraft` package contains a Python API for RAFT algorithms and primitives. The package is currently limited to pairwise distances, and we will continue adding more.
+The `pylibraft` package contains a Python API for RAFT algorithms and primitives. `pylibraft` integrates nicely into other libraries by being very lightweight with minimal dependencies and accepting any object that supports the `__cuda_array_interface__`, such as [CuPy's ndarray](https://docs.cupy.dev/en/stable/user_guide/interoperability.html#rmm). The package is currently limited to pairwise distances and RMAT graph generation, but we will continue adding more in future releases.
 
-The example below demonstrates computing the pairwise Euclidean distances between cupy arrays. `pylibraft` is a low-level API that prioritizes efficiency and simplicity over being pythonic, which is shown here by pre-allocating the output memory before invoking the `pairwise_distance` function.
+The example below demonstrates computing the pairwise Euclidean distances between CuPy arrays. `pylibraft` is a low-level API that prioritizes efficiency and simplicity over being pythonic, which is shown here by pre-allocating the output memory before invoking the `pairwise_distance` function. Note that CuPy is not a required dependency for `pylibraft`.
 
 ```python
 import cupy as cp
@@ -99,7 +99,7 @@ pairwise_distance(in1, in2, output, metric="euclidean")
 
 ## Installing
 
-RAFT itself can be installed through conda, [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), or by building the repository from source. Please refer to the [build instructions](BUILD.md) for more a comprehensive guide on building RAFT and using it in downstream projects.
+RAFT itself can be installed through conda, [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), or by building the repository from source. Please refer to the [build instructions](docs/source/build.md) for a more comprehensive guide on building RAFT and using it in downstream projects.
 
 ### Conda
 
@@ -107,7 +107,7 @@ The easiest way to install RAFT is through conda and several packages are provid
 - `libraft-headers` RAFT headers
 - `libraft-nn` (optional) contains shared libraries for the nearest neighbors primitives.
 - `libraft-distance` (optional) contains shared libraries for distance primitives.
-- `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives
+- `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives.
 - `raft-dask` (optional) enables deployment of multi-node multi-GPU algorithms that use RAFT `raft::comms` in Dask clusters.
 
 Use the following command to install all of the RAFT packages with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages). `mamba` is preferred over the `conda` command.
@@ -119,7 +119,7 @@ You can also install the `libraft-*` conda packages individually using the `mamb
 
 After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ cmake build to compile and/or link against needed dependencies in your raft target. `COMPONENTS` are optional and will depend on the packages installed.
 
-### CPM
+### Cmake & CPM
 
 RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library, which makes it simple to include in downstream cmake projects. RAPIDS CMake provides a convenience layer around CPM. 
 
@@ -186,7 +186,7 @@ mamba activate raft_dev_env
 ./build.sh raft-dask pylibraft libraft tests bench --compile-libs
 ```
 
-The [build](BUILD.md) instructions contain more details on building RAFT from source and including it in downstream projects. You can also find a more comprehensive version of the above CPM code snippet the [Building RAFT C++ from source](BUILD.md#build_cxx_source) section of the build instructions.
+The [build](docs/source/build.md) instructions contain more details on building RAFT from source and including it in downstream projects. You can also find a more comprehensive version of the above CPM code snippet in the [Building RAFT C++ from source](docs/source/build.md#building-raft-c-from-source-in-cmake) section of the build instructions.
 
 ## Folder Structure and Contents
 
@@ -198,11 +198,29 @@ The folder structure mirrors other RAPIDS repos, with the following folders:
   - `bench`: Benchmarks source code
   - `cmake`: Cmake modules and templates
   - `doxygen`: Doxygen configuration
-  - `include`: The C++ API headers are fully-contained here
+  - `include`: The C++ API headers are fully-contained here (deprecated directories are excluded from the listing below)
+    - `cluster`: Basic clustering primitives and algorithms.
+    - `comms`: A multi-node multi-GPU communications abstraction layer for NCCL+UCX and MPI+NCCL, which can be deployed in Dask clusters using the `raft-dask` Python package.
+    - `core`: Core API headers which require minimal dependencies aside from RMM and Cudatoolkit. These are safe to expose on public APIs and do not require `nvcc` to build. This is the same for any headers in RAFT which have the suffix `*_types.hpp`. 
+    - `distance`: Distance primitives
+    - `linalg`: Dense linear algebra
+    - `matrix`: Dense matrix operations
+    - `neighbors`: Nearest neighbors and knn graph construction
+    - `random`: Random number generation, sampling, and data generation primitives
+    - `solver`: Iterative and combinatorial solvers for optimization and approximation
+    - `sparse`: Sparse matrix operations
+      - `convert`: Sparse conversion functions
+      - `distance`: Sparse distance computations
+      - `linalg`: Sparse linear algebra
+      - `neighbors`: Sparse nearest neighbors and knn graph construction
+      - `op`: Various sparse operations such as slicing and filtering (Note: this will soon be renamed to `sparse/matrix`)
+      - `solver`: Sparse solvers for optimization and approximation
+    - `stats`: Moments, summary statistics, model performance measures
+    - `util`: Various reusable tools and utilities for accelerated algorithm development
   - `scripts`: Helpful scripts for development
   - `src`: Compiled APIs and template specializations for the shared libraries
   - `test`: Googletests source code
-- `docs`: Source code and scripts for building library documentation (doxygen + pydocs)
+- `docs`: Source code and scripts for building library documentation (Uses breathe, doxygen, & pydocs)
 - `python`: Source code for Python libraries.
   - `pylibraft`: Python build and source code for pylibraft library
   - `raft-dask`: Python build and source code for raft-dask library
diff --git a/build.sh b/build.sh
index d1dd8bdde1..61e6d1a007 100755
--- a/build.sh
+++ b/build.sh
@@ -40,8 +40,8 @@ HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<to
                                   the only option to be supported)
    --minimal-deps              - disables dependencies like thrust so they can be overridden.
                                  can be useful for a pure header-only install
-   --limit-tests               - semicolon-separated list of test executables to compile (e.g. SPATIAL_TEST;CLUSTER_TEST)
-   --limit-bench               - semicolon-separated list of benchmark executables to compute (e.g. SPATIAL_BENCH;CLUSTER_BENCH)
+   --limit-tests               - semicolon-separated list of test executables to compile (e.g. NEIGHBORS_TEST;CLUSTER_TEST)
+   --limit-bench               - semicolon-separated list of benchmark executables to compute (e.g. NEIGHBORS_BENCH;CLUSTER_BENCH)
    --allgpuarch                - build for all supported GPU architectures
    --buildfaiss                - build faiss statically into raft
    --install                   - install cmake targets
@@ -72,8 +72,8 @@ COMPILE_NN_LIBRARY=OFF
 COMPILE_DIST_LIBRARY=OFF
 ENABLE_NN_DEPENDENCIES=OFF
 
-TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NN_TEST;SPATIAL_TEST;STATS_TEST;UTILS_TEST"
-BENCH_TARGETS="CLUSTER_BENCH;SPATIAL_BENCH;DISTANCE_BENCH;LINALG_BENCH;SPARSE_BENCH;RANDOM_BENCH"
+TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NEIGHBORS_TEST;NEIGHBORS_TEST;STATS_TEST;UTILS_TEST"
+BENCH_TARGETS="CLUSTER_BENCH;NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH;SPARSE_BENCH;RANDOM_BENCH"
 ENABLE_thrust_DEPENDENCY=ON
 
 CACHE_ARGS=""
@@ -227,18 +227,50 @@ fi
 
 if hasArg tests || (( ${NUMARGS} == 0 )); then
     BUILD_TESTS=ON
-    COMPILE_DIST_LIBRARY=ON
-    ENABLE_NN_DEPENDENCIES=ON
-    COMPILE_NN_LIBRARY=ON
     CMAKE_TARGET="${CMAKE_TARGET};${TEST_TARGETS}"
+
+    # Force compile nn library when needed test targets are specified
+    if [[ $CMAKE_TARGET == *"CLUSTER_TEST"* || \
+          $CMAKE_TARGET == *"SPARSE_DIST_TEST"* || \
+          $CMAKE_TARGET == *"SPARSE_NEIGHBORS_TEST"* || \
+          $CMAKE_TARGET == *"NEIGHBORS_TEST"* || \
+          $CMAKE_TARGET == *"STATS_TEST"* ]]; then
+      echo "-- Enabling nearest neighbors lib for gtests"
+      ENABLE_NN_DEPENDENCIES=ON
+      COMPILE_NN_LIBRARY=ON
+    fi
+
+    # Force compile distance library when needed test targets are specified (matched anywhere in the list)
+    if [[ $CMAKE_TARGET == *"CLUSTER_TEST"* || \
+          $CMAKE_TARGET == *"DISTANCE_TEST"* || \
+          $CMAKE_TARGET == *"SPARSE_DIST_TEST"* || \
+          $CMAKE_TARGET == *"SPARSE_NEIGHBORS_TEST"* || \
+          $CMAKE_TARGET == *"NEIGHBORS_TEST"* || \
+          $CMAKE_TARGET == *"STATS_TEST"* ]]; then
+      echo "-- Enabling distance lib for gtests"
+      COMPILE_DIST_LIBRARY=ON
+    fi
 fi
 
 if hasArg bench || (( ${NUMARGS} == 0 )); then
     BUILD_BENCH=ON
-    COMPILE_DIST_LIBRARY=ON
-    ENABLE_NN_DEPENDENCIES=ON
-    COMPILE_NN_LIBRARY=ON
     CMAKE_TARGET="${CMAKE_TARGET};${BENCH_TARGETS}"
+
+    # Force compile nn library when needed benchmark targets are specified
+    if [[ $CMAKE_TARGET == *"CLUSTER_BENCH"* || \
+          $CMAKE_TARGET == *"NEIGHBORS_BENCH"*  ]]; then
+      echo "-- Enabling nearest neighbors lib for benchmarks"
+      ENABLE_NN_DEPENDENCIES=ON
+      COMPILE_NN_LIBRARY=ON
+    fi
+
+    # Force compile distance library when needed benchmark targets are specified
+    if [[ $CMAKE_TARGET == *"CLUSTER_BENCH"* || \
+          $CMAKE_TARGET == *"NEIGHBORS_BENCH"* ]]; then
+      echo "-- Enabling distance lib for benchmarks"
+      COMPILE_DIST_LIBRARY=ON
+    fi
+
 fi
 
 if hasArg --buildfaiss; then
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index d41730c89c..53041be957 100644
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -33,6 +33,12 @@ unset GIT_DESCRIBE_TAG
 # ucx-py version
 export UCX_PY_VERSION='0.29.*'
 
+# Whether to install dask nightly or stable packages.
+export INSTALL_DASK_MAIN=1
+
+# Dask version to install when `INSTALL_DASK_MAIN=0`
+export DASK_STABLE_VERSION="2022.9.2"
+
 ################################################################################
 # SETUP - Check environment
 ################################################################################
@@ -80,11 +86,17 @@ if hasArg --skip-tests; then
     exit 0
 fi
 
-# Install the master version of dask, distributed, and dask-ml
-gpuci_logger "Install the master version of dask and distributed"
 set -x
-pip install "git+https://github.com/dask/distributed.git@2022.9.2" --upgrade --no-deps
-pip install "git+https://github.com/dask/dask.git@2022.9.2" --upgrade --no-deps
+# Install latest nightly version for dask and distributed depending on `INSTALL_DASK_MAIN`
+if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then
+  gpuci_logger "Installing dask and distributed from dask nightly channel"
+  gpuci_mamba_retry install -c dask/label/dev \
+    "dask/label/dev::dask" \
+    "dask/label/dev::distributed"
+else
+  gpuci_logger "gpuci_mamba_retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==${DASK_STABLE_VERSION} conda-forge::dask-core==${DASK_STABLE_VERSION} --force-reinstall"
+  gpuci_mamba_retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==${DASK_STABLE_VERSION} conda-forge::dask-core==${DASK_STABLE_VERSION} --force-reinstall
+fi
 set +x
 
 gpuci_logger "Check GPU usage"
diff --git a/conda/environments/raft_dev_cuda11.0.yml b/conda/environments/raft_dev_cuda11.0.yml
index 6749b4be06..3e0954f663 100644
--- a/conda/environments/raft_dev_cuda11.0.yml
+++ b/conda/environments/raft_dev_cuda11.0.yml
@@ -3,6 +3,7 @@ channels:
 - rapidsai
 - nvidia
 - rapidsai-nightly
+- dask/label/dev
 - conda-forge
 dependencies:
 - c-compiler
@@ -13,6 +14,8 @@ dependencies:
 - clang-tools=11.1.0
 - cython>=0.29,<0.30
 - cmake>=3.23.1
+- dask>=2022.9.2
+- distributed>=2022.9.2
 - scikit-build>=0.13.1
 - rapids-build-env=22.12.*
 - rapids-notebook-env=22.12.*
@@ -30,8 +33,6 @@ dependencies:
 - pip:
     - sphinx_markdown_tables
     - breathe
-    - git+https://github.com/dask/dask.git@2022.9.2
-    - git+https://github.com/dask/distributed.git@2022.9.2
 
 # rapids-build-env, notebook-env and doc-env are defined in
 # https://docs.rapids.ai/maintainers/depmgmt/
diff --git a/conda/environments/raft_dev_cuda11.2.yml b/conda/environments/raft_dev_cuda11.2.yml
index edb6b8241b..33aa343baf 100644
--- a/conda/environments/raft_dev_cuda11.2.yml
+++ b/conda/environments/raft_dev_cuda11.2.yml
@@ -3,6 +3,7 @@ channels:
 - rapidsai
 - nvidia
 - rapidsai-nightly
+- dask/label/dev
 - conda-forge
 dependencies:
 - c-compiler
@@ -13,6 +14,8 @@ dependencies:
 - clang-tools=11.1.0
 - cython>=0.29,<0.30
 - cmake>=3.23.1
+- dask>=2022.9.2
+- distributed>=2022.9.2
 - scikit-build>=0.13.1
 - rapids-build-env=22.12.*
 - rapids-notebook-env=22.12.*
@@ -30,8 +33,6 @@ dependencies:
 - pip:
     - sphinx_markdown_tables
     - breathe
-    - git+https://github.com/dask/dask.git@2022.9.2
-    - git+https://github.com/dask/distributed.git@2022.9.2
 
 # rapids-build-env, notebook-env and doc-env are defined in
 # https://docs.rapids.ai/maintainers/depmgmt/
diff --git a/conda/environments/raft_dev_cuda11.4.yml b/conda/environments/raft_dev_cuda11.4.yml
index 918d6db76a..6c8bd3ad50 100644
--- a/conda/environments/raft_dev_cuda11.4.yml
+++ b/conda/environments/raft_dev_cuda11.4.yml
@@ -3,6 +3,7 @@ channels:
 - rapidsai
 - nvidia
 - rapidsai-nightly
+- dask/label/dev
 - conda-forge
 dependencies:
 - c-compiler
@@ -13,6 +14,8 @@ dependencies:
 - clang-tools=11.1.0
 - cython>=0.29,<0.30
 - cmake>=3.23.1
+- dask>=2022.9.2
+- distributed>=2022.9.2
 - scikit-build>=0.13.1
 - rapids-build-env=22.12.*
 - rapids-notebook-env=22.12.*
@@ -30,8 +33,6 @@ dependencies:
 - pip:
     - sphinx_markdown_tables
     - breathe
-    - git+https://github.com/dask/dask.git@2022.9.2
-    - git+https://github.com/dask/distributed.git@2022.9.2
 
 # rapids-build-env, notebook-env and doc-env are defined in
 # https://docs.rapids.ai/maintainers/depmgmt/
diff --git a/conda/environments/raft_dev_cuda11.5.yml b/conda/environments/raft_dev_cuda11.5.yml
index 00c577a896..12d202c265 100644
--- a/conda/environments/raft_dev_cuda11.5.yml
+++ b/conda/environments/raft_dev_cuda11.5.yml
@@ -3,6 +3,7 @@ channels:
 - rapidsai
 - nvidia
 - rapidsai-nightly
+- dask/label/dev
 - conda-forge
 dependencies:
 - c-compiler
@@ -14,6 +15,8 @@ dependencies:
 - clang-tools=11.1.0
 - cython>=0.29,<0.30
 - cmake>=3.23.1
+- dask>=2022.9.2
+- distributed>=2022.9.2
 - scikit-build>=0.13.1
 - rapids-build-env=22.12.*
 - rapids-notebook-env=22.12.*
@@ -31,8 +34,6 @@ dependencies:
 - pip:
     - sphinx_markdown_tables
     - breathe
-    - git+https://github.com/dask/dask.git@2022.9.2
-    - git+https://github.com/dask/distributed.git@2022.9.2
 
 # rapids-build-env, notebook-env and doc-env are defined in
 # https://docs.rapids.ai/maintainers/depmgmt/
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index 4e10294db7..5213ae2b8b 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -51,8 +51,8 @@ requirements:
     - ucx >={{ ucx_version }}
     - ucx-py {{ ucx_py_version }}
     - ucx-proc=*=gpu
-    - dask==2022.9.2
-    - distributed==2022.9.2
+    - dask>=2022.9.2
+    - distributed>=2022.9.2
     - cuda-python >=11.5,<11.7.1
     - joblib >=0.11
     - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 12bebfa2a5..3998a7c024 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -167,7 +167,8 @@ if(BUILD_TESTS)
 endif()
 
 if(BUILD_BENCH)
-  include(cmake/thirdparty/get_gbench.cmake)
+  include(${rapids-cmake-dir}/cpm/gbench.cmake)
+  rapids_cpm_gbench()
 endif()
 
 ##############################################################################
@@ -244,15 +245,25 @@ set_target_properties(raft_distance PROPERTIES EXPORT_NAME distance)
 if(RAFT_COMPILE_DIST_LIBRARY)
   add_library(raft_distance_lib
     src/distance/pairwise_distance.cu
+    src/distance/fused_l2_min_arg.cu
     src/distance/specializations/detail/canberra.cu
     src/distance/specializations/detail/chebyshev.cu
     src/distance/specializations/detail/correlation.cu
     src/distance/specializations/detail/cosine.cu
     src/distance/specializations/detail/hamming_unexpanded.cu
     src/distance/specializations/detail/hellinger_expanded.cu
     src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu
     src/distance/specializations/detail/jensen_shannon_float_float_float_uint32.cu
     src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu
+    src/distance/specializations/detail/kernels/gram_matrix_base_double.cu
+    src/distance/specializations/detail/kernels/gram_matrix_base_float.cu
+    src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
+    src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu
+# These are somehow missing a kernel definition which is causing a compile error.
+#    src/distance/specializations/detail/kernels/rbf_kernel_double.cu
+#    src/distance/specializations/detail/kernels/rbf_kernel_float.cu
+    src/distance/specializations/detail/kernels/tanh_kernel_double.cu
+    src/distance/specializations/detail/kernels/tanh_kernel_float.cu
     src/distance/specializations/detail/kl_divergence_float_float_float_int.cu
     src/distance/specializations/detail/kl_divergence_float_float_float_uint32.cu
     src/distance/specializations/detail/kl_divergence_double_double_double_int.cu
diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt
index 51170e4265..e0f42d1803 100644
--- a/cpp/bench/CMakeLists.txt
+++ b/cpp/bench/CMakeLists.txt
@@ -30,8 +30,6 @@ set(BENCH_NAME ${ConfigureBench_NAME})
 
 add_executable(${BENCH_NAME} ${ConfigureBench_PATH})
 
-message("BENCH PATH: ${ConfigureBench_PATH}")
-
 target_link_libraries(${BENCH_NAME}
         PRIVATE
         raft::raft
@@ -87,6 +85,7 @@ if(BUILD_BENCH)
             bench/distance/distance_exp_l2.cu
             bench/distance/distance_l1.cu
             bench/distance/distance_unexp_l2.cu
+            bench/distance/kernels.cu
             bench/main.cpp
             OPTIONAL DIST
             )
@@ -96,6 +95,7 @@ if(BUILD_BENCH)
             bench/linalg/add.cu
             bench/linalg/map_then_reduce.cu
             bench/linalg/matrix_vector_op.cu
+            bench/linalg/reduce_rows_by_key.cu
             bench/linalg/reduce.cu
             bench/main.cpp
             )
@@ -114,20 +114,20 @@ if(BUILD_BENCH)
             bench/main.cpp
             )
 
-    ConfigureBench(NAME SPATIAL_BENCH
+    ConfigureBench(NAME NEIGHBORS_BENCH
             PATH
-            bench/spatial/fused_l2_nn.cu
-            bench/spatial/knn/brute_force_float_int64_t.cu
-            bench/spatial/knn/brute_force_float_uint32_t.cu
-            bench/spatial/knn/ivf_flat_float_int64_t.cu
-            bench/spatial/knn/ivf_flat_float_uint32_t.cu
-            bench/spatial/knn/ivf_flat_int8_t_int64_t.cu
-            bench/spatial/knn/ivf_flat_uint8_t_uint32_t.cu
-            bench/spatial/knn/ivf_pq_float_int64_t.cu
-            bench/spatial/knn/ivf_pq_float_uint32_t.cu
-            bench/spatial/knn/ivf_pq_int8_t_int64_t.cu
-            bench/spatial/knn/ivf_pq_uint8_t_uint32_t.cu
-            bench/spatial/selection.cu
+            bench/neighbors/fused_l2_nn.cu
+            bench/neighbors/knn/brute_force_float_int64_t.cu
+            bench/neighbors/knn/brute_force_float_uint32_t.cu
+            bench/neighbors/knn/ivf_flat_float_int64_t.cu
+            bench/neighbors/knn/ivf_flat_float_uint32_t.cu
+            bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu
+            bench/neighbors/knn/ivf_flat_uint8_t_uint32_t.cu
+            bench/neighbors/knn/ivf_pq_float_int64_t.cu
+            bench/neighbors/knn/ivf_pq_float_uint32_t.cu
+            bench/neighbors/knn/ivf_pq_int8_t_int64_t.cu
+            bench/neighbors/knn/ivf_pq_uint8_t_uint32_t.cu
+            bench/neighbors/selection.cu
             bench/main.cpp
             OPTIONAL DIST NN
             )
diff --git a/cpp/bench/common/benchmark.hpp b/cpp/bench/common/benchmark.hpp
index adfe5218e2..13ca40a033 100644
--- a/cpp/bench/common/benchmark.hpp
+++ b/cpp/bench/common/benchmark.hpp
@@ -53,8 +53,9 @@ struct using_pool_memory_res {
     rmm::mr::set_current_device_resource(&pool_res_);
   }
 
-  using_pool_memory_res() : using_pool_memory_res(size_t(1) << size_t(30), size_t(16) << size_t(30))
+  using_pool_memory_res() : orig_res_(rmm::mr::get_current_device_resource()), pool_res_(&cuda_res_)
   {
+    rmm::mr::set_current_device_resource(&pool_res_);
   }
 
   ~using_pool_memory_res() { rmm::mr::set_current_device_resource(orig_res_); }
diff --git a/cpp/bench/distance/distance_common.cuh b/cpp/bench/distance/distance_common.cuh
index 4f1a8ccab1..73faacce37 100644
--- a/cpp/bench/distance/distance_common.cuh
+++ b/cpp/bench/distance/distance_common.cuh
@@ -15,8 +15,8 @@
  */
 
 #include <common/benchmark.hpp>
-#include <raft/cudart_utils.h>
 #include <raft/distance/distance.cuh>
+#include <raft/util/cudart_utils.hpp>
 #if defined RAFT_DISTANCE_COMPILED
 #include <raft/distance/specializations.cuh>
 #endif
diff --git a/cpp/bench/distance/kernels.cu b/cpp/bench/distance/kernels.cu
new file mode 100644
index 0000000000..5c9c2cc2ed
--- /dev/null
+++ b/cpp/bench/distance/kernels.cu
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
+#endif
+
+#include <common/benchmark.hpp>
+#include <memory>
+#include <raft/core/handle.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/distance/kernels.cuh>
+#include <raft/random/rng.cuh>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace raft::bench::distance::kernels {
+
+using namespace raft::distance::kernels;
+struct GramTestParams {
+  int m;  // m parameter of the GEMM
+  int k;  // k parameter of the GEMM
+  int n;  // n parameter of the GEMM
+  KernelParams kernel_params;
+  bool is_row_major;
+};  // struct GramTestParams
+
+/**
+ * Benchmark fixture that evaluates a Gram-matrix kernel (built by KernelFactory) on inputs
+ * A [m * k] and B [k * n] filled with uniform random values, writing the result into C [m * n].
+ */
+template <typename T>
+struct GramMatrix : public fixture {
+  GramMatrix(const GramTestParams& p)
+    : handle(stream), params(p), A(0, stream), B(0, stream), C(0, stream)
+  {
+    kernel = std::unique_ptr<GramMatrixBase<T>>(
+      KernelFactory<T>::create(p.kernel_params, handle.get_cublas_handle()));
+
+    A.resize(params.m * params.k, stream);
+    B.resize(params.k * params.n, stream);
+    C.resize(params.m * params.n, stream);
+    raft::random::Rng r(123456ULL);
+    r.uniform(A.data(), params.m * params.k, T(-1.0), T(1.0), stream);
+    r.uniform(B.data(), params.k * params.n, T(-1.0), T(1.0), stream);
+  }
+
+  ~GramMatrix()
+  {
+    A.release();
+    B.release();
+    C.release();
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    // Bail out after reporting the error so the missing kernel is never invoked below.
+    if (!this->kernel) {
+      state.SkipWithError("Kernel matrix is not initialized");
+      return;
+    }
+    loop_on_state(state, [this]() {
+      (*this->kernel)(A.data(),
+                      this->params.m,
+                      this->params.k,
+                      B.data(),
+                      this->params.n,
+                      C.data(),
+                      this->params.is_row_major,
+                      this->stream);
+    });
+  }
+
+ private:
+  const raft::handle_t handle;
+  std::unique_ptr<GramMatrixBase<T>> kernel;
+  GramTestParams params;
+
+  rmm::device_uvector<T> A;  // input matrix A, size [m * k]
+  rmm::device_uvector<T> B;  // input matrix B, size [n * k]
+  rmm::device_uvector<T> C;  // output matrix C, size [m*n]
+};
+
+/** Builds the benchmark grid: data sizes x kernel types x both is_row_major settings. */
+static std::vector<GramTestParams> getInputs()
+{
+  std::vector<GramTestParams> param_vec;
+  std::vector<KernelParams> kernel_params{KernelParams{LINEAR, 3, 1, 0},
+                                          KernelParams{POLYNOMIAL, 2, 1.3, 1},
+                                          KernelParams{TANH, 2, 0.5, 2.4},
+                                          KernelParams{RBF, 2, 0.5, 0}};
+  struct TestSize {
+    int m;
+    int k;
+    int n;
+  };
+  std::vector<TestSize> data_size{{4096, 10, 1024},
+                                  {4096, 100, 1024},
+                                  {4096, 1000, 1024},
+                                  {4096, 10000, 1024},
+                                  {100000, 10, 1024},
+                                  {100000, 100, 1024},
+                                  {100000, 1000, 1024}};
+
+  // Two entries (is_row_major = false and true) are pushed per (size, kernel) pair.
+  param_vec.reserve(2 * kernel_params.size() * data_size.size());
+  for (TestSize s : data_size) {
+    for (auto kernel : kernel_params) {
+      for (bool row_major : {false, true}) {
+        param_vec.push_back(GramTestParams{s.m, s.k, s.n, kernel, row_major});
+      }
+    }
+  }
+  return param_vec;
+}
+
+RAFT_BENCH_REGISTER(GramMatrix<float>, "", getInputs());
+RAFT_BENCH_REGISTER(GramMatrix<double>, "", getInputs());
+
+}  // namespace raft::bench::distance::kernels
diff --git a/cpp/bench/linalg/reduce_rows_by_key.cu b/cpp/bench/linalg/reduce_rows_by_key.cu
new file mode 100644
index 0000000000..075bc7c8c4
--- /dev/null
+++ b/cpp/bench/linalg/reduce_rows_by_key.cu
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/linalg/reduce_rows_by_key.cuh>
+#include <raft/random/rng.cuh>
+
+#include <rmm/device_uvector.hpp>
+
+namespace raft::bench::linalg {
+
+struct rrbk_params {
+  int64_t rows, cols;  // input buffer holds rows * cols elements
+  int64_t keys;        // size of the generated key range; output holds keys * cols elements
+};
+
+template <typename T, typename KeyT>
+struct reduce_rows_by_key : public fixture {  // benchmark of raft::linalg::reduce_rows_by_key
+  reduce_rows_by_key(const rrbk_params& p)
+    : params(p),
+      in(p.rows * p.cols, stream),   // input values are not filled in by this fixture
+      out(p.keys * p.cols, stream),  // keys * cols output elements
+      keys(p.rows, stream),          // one key per input row
+      workspace(p.rows, stream)      // p.rows bytes of scratch handed to the primitive
+  {
+    raft::random::RngState rng{42};  // fixed seed, so every run sees the same key assignment
+    raft::random::uniformInt(rng, keys.data(), p.rows, (KeyT)0, (KeyT)p.keys, stream);
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    loop_on_state(state, [this]() {
+      raft::linalg::reduce_rows_by_key(in.data(),
+                                       params.cols,  // presumably the row stride of `in` (TODO confirm)
+                                       keys.data(),
+                                       workspace.data(),
+                                       params.rows,
+                                       params.cols,
+                                       params.keys,
+                                       out.data(),
+                                       stream,
+                                       false);  // NOTE(review): flag meaning not visible here; confirm
+    });
+  }
+
+ protected:
+  rrbk_params params;
+  rmm::device_uvector<T> in, out;
+  rmm::device_uvector<KeyT> keys;
+  rmm::device_uvector<char> workspace;
+};  // struct reduce_rows_by_key
+
+const std::vector<rrbk_params> kInputSizes{  // each entry is {rows, cols, keys}
+  {10000, 128, 64},
+  {100000, 128, 64},
+  {1000000, 128, 64},
+  {10000000, 128, 64},
+  {10000, 128, 256},
+  {100000, 128, 256},
+  {1000000, 128, 256},
+  {10000000, 128, 256},
+  {10000, 128, 1024},
+  {100000, 128, 1024},
+  {1000000, 128, 1024},
+  {10000000, 128, 1024},
+  {10000, 128, 4096},
+  {100000, 128, 4096},
+  {1000000, 128, 4096},
+  {10000000, 128, 4096},
+};
+
+RAFT_BENCH_REGISTER((reduce_rows_by_key<float, uint32_t>), "", kInputSizes);
+RAFT_BENCH_REGISTER((reduce_rows_by_key<double, uint32_t>), "", kInputSizes);
+
+}  // namespace raft::bench::linalg
diff --git a/cpp/bench/spatial/fused_l2_nn.cu b/cpp/bench/neighbors/fused_l2_nn.cu
similarity index 100%
rename from cpp/bench/spatial/fused_l2_nn.cu
rename to cpp/bench/neighbors/fused_l2_nn.cu
diff --git a/cpp/bench/spatial/knn.cuh b/cpp/bench/neighbors/knn.cuh
similarity index 95%
rename from cpp/bench/spatial/knn.cuh
rename to cpp/bench/neighbors/knn.cuh
index bb01320cdf..d38631b289 100644
--- a/cpp/bench/spatial/knn.cuh
+++ b/cpp/bench/neighbors/knn.cuh
@@ -20,8 +20,8 @@
 
 #include <raft/random/rng.cuh>
 
-#include <raft/spatial/knn/ivf_flat.cuh>
-#include <raft/spatial/knn/ivf_pq.cuh>
+#include <raft/neighbors/ivf_flat.cuh>
+#include <raft/neighbors/ivf_pq.cuh>
 #include <raft/spatial/knn/knn.cuh>
 
 #if defined RAFT_DISTANCE_COMPILED
@@ -143,16 +143,16 @@ template <typename ValT, typename IdxT>
 struct ivf_flat_knn {
   using dist_t = float;
 
-  std::optional<const raft::spatial::knn::ivf_flat::index<ValT, IdxT>> index;
-  raft::spatial::knn::ivf_flat::index_params index_params;
-  raft::spatial::knn::ivf_flat::search_params search_params;
+  std::optional<const raft::neighbors::ivf_flat::index<ValT, IdxT>> index;
+  raft::neighbors::ivf_flat::index_params index_params;
+  raft::neighbors::ivf_flat::search_params search_params;
   params ps;
 
   ivf_flat_knn(const raft::handle_t& handle, const params& ps, const ValT* data) : ps(ps)
   {
     index_params.n_lists = 4096;
     index_params.metric  = raft::distance::DistanceType::L2Expanded;
-    index.emplace(raft::spatial::knn::ivf_flat::build(
+    index.emplace(raft::neighbors::ivf_flat::build(
       handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims)));
   }
 
@@ -162,7 +162,7 @@ struct ivf_flat_knn {
               IdxT* out_idxs)
   {
     search_params.n_probes = 20;
-    raft::spatial::knn::ivf_flat::search(
+    raft::neighbors::ivf_flat::search(
       handle, search_params, *index, search_items, ps.n_queries, ps.k, out_idxs, out_dists);
   }
 };
@@ -171,16 +171,16 @@ template <typename ValT, typename IdxT>
 struct ivf_pq_knn {
   using dist_t = float;
 
-  std::optional<const raft::spatial::knn::ivf_pq::index<IdxT>> index;
-  raft::spatial::knn::ivf_pq::index_params index_params;
-  raft::spatial::knn::ivf_pq::search_params search_params;
+  std::optional<const raft::neighbors::ivf_pq::index<IdxT>> index;
+  raft::neighbors::ivf_pq::index_params index_params;
+  raft::neighbors::ivf_pq::search_params search_params;
   params ps;
 
   ivf_pq_knn(const raft::handle_t& handle, const params& ps, const ValT* data) : ps(ps)
   {
     index_params.n_lists = 4096;
     index_params.metric  = raft::distance::DistanceType::L2Expanded;
-    index.emplace(raft::spatial::knn::ivf_pq::build(
+    index.emplace(raft::neighbors::ivf_pq::build(
       handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims)));
   }
 
@@ -190,7 +190,7 @@ struct ivf_pq_knn {
               IdxT* out_idxs)
   {
     search_params.n_probes = 20;
-    raft::spatial::knn::ivf_pq::search(
+    raft::neighbors::ivf_pq::search(
       handle, search_params, *index, search_items, ps.n_queries, ps.k, out_idxs, out_dists);
   }
 };
diff --git a/cpp/bench/spatial/knn/brute_force_float_int64_t.cu b/cpp/bench/neighbors/knn/brute_force_float_int64_t.cu
similarity index 100%
rename from cpp/bench/spatial/knn/brute_force_float_int64_t.cu
rename to cpp/bench/neighbors/knn/brute_force_float_int64_t.cu
diff --git a/cpp/bench/spatial/knn/brute_force_float_uint32_t.cu b/cpp/bench/neighbors/knn/brute_force_float_uint32_t.cu
similarity index 100%
rename from cpp/bench/spatial/knn/brute_force_float_uint32_t.cu
rename to cpp/bench/neighbors/knn/brute_force_float_uint32_t.cu
diff --git a/cpp/bench/spatial/knn/ivf_flat_float_int64_t.cu b/cpp/bench/neighbors/knn/ivf_flat_float_int64_t.cu
similarity index 100%
rename from cpp/bench/spatial/knn/ivf_flat_float_int64_t.cu
rename to cpp/bench/neighbors/knn/ivf_flat_float_int64_t.cu
diff --git a/cpp/bench/spatial/knn/ivf_flat_float_uint32_t.cu b/cpp/bench/neighbors/knn/ivf_flat_float_uint32_t.cu
similarity index 100%
rename from cpp/bench/spatial/knn/ivf_flat_float_uint32_t.cu
rename to cpp/bench/neighbors/knn/ivf_flat_float_uint32_t.cu
diff --git a/cpp/bench/spatial/knn/ivf_flat_int8_t_int64_t.cu b/cpp/bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu
similarity index 100%
rename from cpp/bench/spatial/knn/ivf_flat_int8_t_int64_t.cu
rename to cpp/bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu
diff --git a/cpp/bench/spatial/knn/ivf_flat_uint8_t_uint32_t.cu b/cpp/bench/neighbors/knn/ivf_flat_uint8_t_uint32_t.cu
similarity index 100%
rename from cpp/bench/spatial/knn/ivf_flat_uint8_t_uint32_t.cu
rename to cpp/bench/neighbors/knn/ivf_flat_uint8_t_uint32_t.cu
diff --git a/cpp/bench/spatial/knn/ivf_pq_float_int64_t.cu b/cpp/bench/neighbors/knn/ivf_pq_float_int64_t.cu
similarity index 100%
rename from cpp/bench/spatial/knn/ivf_pq_float_int64_t.cu
rename to cpp/bench/neighbors/knn/ivf_pq_float_int64_t.cu
diff --git a/cpp/bench/spatial/knn/ivf_pq_float_uint32_t.cu b/cpp/bench/neighbors/knn/ivf_pq_float_uint32_t.cu
similarity index 100%
rename from cpp/bench/spatial/knn/ivf_pq_float_uint32_t.cu
rename to cpp/bench/neighbors/knn/ivf_pq_float_uint32_t.cu
diff --git a/cpp/bench/spatial/knn/ivf_pq_int8_t_int64_t.cu b/cpp/bench/neighbors/knn/ivf_pq_int8_t_int64_t.cu
similarity index 100%
rename from cpp/bench/spatial/knn/ivf_pq_int8_t_int64_t.cu
rename to cpp/bench/neighbors/knn/ivf_pq_int8_t_int64_t.cu
diff --git a/cpp/bench/spatial/knn/ivf_pq_uint8_t_uint32_t.cu b/cpp/bench/neighbors/knn/ivf_pq_uint8_t_uint32_t.cu
similarity index 100%
rename from cpp/bench/spatial/knn/ivf_pq_uint8_t_uint32_t.cu
rename to cpp/bench/neighbors/knn/ivf_pq_uint8_t_uint32_t.cu
diff --git a/cpp/bench/spatial/selection.cu b/cpp/bench/neighbors/selection.cu
similarity index 100%
rename from cpp/bench/spatial/selection.cu
rename to cpp/bench/neighbors/selection.cu
diff --git a/cpp/cmake/thirdparty/get_gbench.cmake b/cpp/cmake/thirdparty/get_gbench.cmake
deleted file mode 100644
index a3d5678f74..0000000000
--- a/cpp/cmake/thirdparty/get_gbench.cmake
+++ /dev/null
@@ -1,43 +0,0 @@
-#=============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#=============================================================================
-
-function(find_and_configure_gbench)
-
-    set(oneValueArgs VERSION PINNED_TAG)
-    cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
-                          "${multiValueArgs}" ${ARGN} )
-
-    rapids_cpm_find(benchmark ${PKG_VERSION}
-        GLOBAL_TARGETS benchmark::benchmark
-        CPM_ARGS
-            GIT_REPOSITORY  https://github.com/google/benchmark.git
-            GIT_TAG         ${PKG_PINNED_TAG}
-            OPTIONS
-              "BENCHMARK_ENABLE_GTEST_TESTS OFF"
-              "BENCHMARK_ENABLE_TESTING OFF"
-              "BENCHMARK_ENABLE_INSTALL OFF"
-              "CMAKE_BUILD_TYPE Release"
-              "CMAKE_INSTALL_LIBDIR lib"
-    )
-
-    if(NOT TARGET benchmark::benchmark)
-        add_library(benchmark::benchmark ALIAS benchmark)
-    endif()
-
-endfunction()
-
-find_and_configure_gbench(VERSION      1.5.3
-                          PINNED_TAG   c05843a9f622db08ad59804c190f98879b76beba)
diff --git a/cpp/doxygen/Doxyfile.in b/cpp/doxygen/Doxyfile.in
index 5517562a9f..07056e503d 100644
--- a/cpp/doxygen/Doxyfile.in
+++ b/cpp/doxygen/Doxyfile.in
@@ -900,7 +900,9 @@ EXCLUDE                = @CMAKE_CURRENT_SOURCE_DIR@/include/raft/sparse/linalg/s
                          @CMAKE_CURRENT_SOURCE_DIR@/include/raft/span.hpp \
                          @CMAKE_CURRENT_SOURCE_DIR@/include/raft/vectorized.cuh \
                          @CMAKE_CURRENT_SOURCE_DIR@/include/raft/raft.hpp \
-                         @CMAKE_CURRENT_SOURCE_DIR@/include/raft/core/cudart_utils.hpp
+                         @CMAKE_CURRENT_SOURCE_DIR@/include/raft/core/cudart_utils.hpp \
+                         @CMAKE_CURRENT_SOURCE_DIR@/include/raft/matrix/math.cuh \
+                         @CMAKE_CURRENT_SOURCE_DIR@/include/raft/matrix/matrix.cuh
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
diff --git a/cpp/include/raft/cluster/detail/connectivities.cuh b/cpp/include/raft/cluster/detail/connectivities.cuh
index da8adf783d..a07045f0d2 100644
--- a/cpp/include/raft/cluster/detail/connectivities.cuh
+++ b/cpp/include/raft/cluster/detail/connectivities.cuh
@@ -27,7 +27,7 @@
 #include <raft/distance/distance_types.hpp>
 #include <raft/sparse/convert/csr.cuh>
 #include <raft/sparse/coo.hpp>
-#include <raft/sparse/spatial/knn_graph.cuh>
+#include <raft/sparse/neighbors/knn_graph.cuh>
 
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/transform.h>
@@ -73,7 +73,7 @@ struct distance_graph_impl<raft::cluster::LinkageDistance::KNN_GRAPH, value_idx,
     // Need to symmetrize knn into undirected graph
     raft::sparse::COO<value_t, value_idx> knn_graph_coo(stream);
 
-    raft::sparse::spatial::knn_graph(handle, X, m, n, metric, knn_graph_coo, c);
+    raft::sparse::neighbors::knn_graph(handle, X, m, n, metric, knn_graph_coo, c);
 
     indices.resize(knn_graph_coo.nnz, stream);
     data.resize(knn_graph_coo.nnz, stream);
diff --git a/cpp/include/raft/cluster/detail/mst.cuh b/cpp/include/raft/cluster/detail/mst.cuh
index 67935d4623..8143d21641 100644
--- a/cpp/include/raft/cluster/detail/mst.cuh
+++ b/cpp/include/raft/cluster/detail/mst.cuh
@@ -19,9 +19,9 @@
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
+#include <raft/sparse/neighbors/connect_components.cuh>
 #include <raft/sparse/op/sort.cuh>
 #include <raft/sparse/solver/mst.cuh>
-#include <raft/sparse/spatial/connect_components.cuh>
 #include <rmm/device_uvector.hpp>
 
 #include <thrust/device_ptr.h>
@@ -80,7 +80,7 @@ void connect_knn_graph(
 
   raft::sparse::COO<value_t, value_idx> connected_edges(stream);
 
-  raft::sparse::spatial::connect_components<value_idx, value_t>(
+  raft::sparse::neighbors::connect_components<value_idx, value_t>(
     handle, connected_edges, X, color, m, n, reduction_op);
 
   rmm::device_uvector<value_idx> indptr2(m + 1, stream);
@@ -153,14 +153,14 @@ void build_sorted_mst(
     handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, true);
 
   int iters        = 1;
-  int n_components = raft::sparse::spatial::get_n_components(color, m, stream);
+  int n_components = raft::sparse::neighbors::get_n_components(color, m, stream);
 
   while (n_components > 1 && iters < max_iter) {
     connect_knn_graph<value_idx, value_t>(handle, X, mst_coo, m, n, color, reduction_op);
 
     iters++;
 
-    n_components = raft::sparse::spatial::get_n_components(color, m, stream);
+    n_components = raft::sparse::neighbors::get_n_components(color, m, stream);
   }
 
   /**
diff --git a/cpp/include/raft/cluster/detail/single_linkage.cuh b/cpp/include/raft/cluster/detail/single_linkage.cuh
index 9eee21b09c..d12db85e1b 100644
--- a/cpp/include/raft/cluster/detail/single_linkage.cuh
+++ b/cpp/include/raft/cluster/detail/single_linkage.cuh
@@ -80,7 +80,7 @@ void single_linkage(const raft::handle_t& handle,
    * 2. Construct MST, sorted by weights
    */
   rmm::device_uvector<value_idx> color(m, stream);
-  raft::sparse::spatial::FixConnectivitiesRedOp<value_idx, value_t> op(color.data(), m);
+  raft::sparse::neighbors::FixConnectivitiesRedOp<value_idx, value_t> op(color.data(), m);
   detail::build_sorted_mst<value_idx, value_t>(handle,
                                                X,
                                                indptr.data(),
diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh
index 0ce35da4a5..ef1fb44dfd 100644
--- a/cpp/include/raft/cluster/kmeans.cuh
+++ b/cpp/include/raft/cluster/kmeans.cuh
@@ -21,7 +21,509 @@
 #include <raft/core/kvp.hpp>
 #include <raft/core/mdarray.hpp>
 
+namespace raft::cluster::kmeans {
+
+/**
+ * Functor used for sampling centroids
+ */
+template <typename DataT, typename IndexT>
+using SamplingOp = detail::SamplingOp<DataT, IndexT>;
+
+/**
+ * Functor used to extract the index from a KeyValue pair
+ * storing both index and a distance.
+ */
+template <typename IndexT, typename DataT>
+using KeyValueIndexOp = detail::KeyValueIndexOp<IndexT, DataT>;
+
+/**
+ * @brief Find clusters with k-means algorithm.
+ *   Initial centroids are chosen with k-means++ algorithm. Empty
+ *   clusters are reinitialized by choosing new centroids with
+ *   k-means++ algorithm.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/handle.hpp>
+ *   #include <raft/cluster/kmeans.cuh>
+ *   #include <raft/cluster/kmeans_types.hpp>
+ *   using namespace raft::cluster;
+ *   ...
+ *   raft::handle_t handle;
+ *   raft::cluster::KMeansParams params;
+ *   int n_features = 15, n_iter; float inertia;
+ *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
+ *
+ *   kmeans::fit(handle,
+ *               params,
+ *               X,
+ *               std::nullopt,
+ *               centroids.view(),
+ *               raft::make_scalar_view(&inertia),
+ *               raft::make_scalar_view(&n_iter));
+ * @endcode
+ *
+ * @tparam DataT the type of data used for weights, distances.
+ * @tparam IndexT the type of data used for indexing.
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Training instances to cluster. The data must
+ *                              be in row-major format.
+ *                              [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional weights for each observation in X.
+ *                              [len = n_samples]
+ * @param[inout]  centroids     [in] When init is InitMethod::Array, use
+ *                              centroids as the initial cluster centers.
+ *                              [out] The generated centroids from the
+ *                              kmeans algorithm are stored at the address
+ *                              pointed by 'centroids'.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    inertia       Sum of squared distances of samples to their
+ *                              closest cluster center.
+ * @param[out]    n_iter        Number of iterations run.
+ */
+template <typename DataT, typename IndexT>
+void fit(handle_t const& handle,
+         const KMeansParams& params,
+         raft::device_matrix_view<const DataT, IndexT> X,
+         std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
+         raft::device_matrix_view<DataT, IndexT> centroids,
+         raft::host_scalar_view<DataT> inertia,
+         raft::host_scalar_view<IndexT> n_iter)
+{
+  detail::kmeans_fit<DataT, IndexT>(handle, params, X, sample_weight, centroids, inertia, n_iter);
+}
+
+/**
+ * @brief Predict the closest cluster each sample in X belongs to.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/handle.hpp>
+ *   #include <raft/cluster/kmeans.cuh>
+ *   #include <raft/cluster/kmeans_types.hpp>
+ *   using namespace raft::cluster;
+ *   ...
+ *   raft::handle_t handle;
+ *   raft::cluster::KMeansParams params;
+ *   int n_features = 15, n_iter; float inertia;
+ *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
+ *
+ *   kmeans::fit(handle,
+ *               params,
+ *               X,
+ *               std::nullopt,
+ *               centroids.view(),
+ *               raft::make_scalar_view(&inertia),
+ *               raft::make_scalar_view(&n_iter));
+ *   ...
+ *   auto labels = raft::make_device_vector<int, int>(handle, X.extent(0));
+ *
+ *   kmeans::predict(handle,
+ *                   params,
+ *                   X,
+ *                   std::nullopt,
+ *                   centroids.view(),
+ *                   labels.view(),
+ *                   false,
+ *                   raft::make_scalar_view(&inertia));
+ * @endcode
+ *
+ * @tparam DataT the type of data used for weights, distances.
+ * @tparam IndexT the type of data used for indexing.
+ * @param[in]     handle           The raft handle.
+ * @param[in]     params           Parameters for KMeans model.
+ * @param[in]     X                New data to predict.
+ *                                 [dim = n_samples x n_features]
+ * @param[in]     sample_weight    Optional weights for each observation in X.
+ *                                 [len = n_samples]
+ * @param[in]     centroids        Cluster centroids. The data must be in
+ *                                 row-major format.
+ *                                 [dim = n_clusters x n_features]
+ * @param[out]    labels           Index of the cluster each sample in X
+ *                                 belongs to.
+ *                                 [len = n_samples]
+ * @param[in]     normalize_weight True if the weights should be normalized
+ * @param[out]    inertia          Sum of squared distances of samples to
+ *                                 their closest cluster center.
+ */
+template <typename DataT, typename IndexT>
+void predict(handle_t const& handle,
+             const KMeansParams& params,
+             raft::device_matrix_view<const DataT, IndexT> X,
+             std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
+             raft::device_matrix_view<const DataT, IndexT> centroids,
+             raft::device_vector_view<IndexT, IndexT> labels,
+             bool normalize_weight,
+             raft::host_scalar_view<DataT> inertia)
+{
+  detail::kmeans_predict<DataT, IndexT>(
+    handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia);
+}
+
+/**
+ * @brief Compute k-means clustering and predicts cluster index for each sample
+ * in the input.
+ *
+ * @code{.cpp}
+ *   #include <raft/core/handle.hpp>
+ *   #include <raft/cluster/kmeans.cuh>
+ *   #include <raft/cluster/kmeans_types.hpp>
+ *   using namespace raft::cluster;
+ *   ...
+ *   raft::handle_t handle;
+ *   raft::cluster::KMeansParams params;
+ *   int n_features = 15, n_iter; float inertia;
+ *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
+ *   auto labels = raft::make_device_vector<int, int>(handle, X.extent(0));
+ *
+ *   kmeans::fit_predict(handle,
+ *                       params,
+ *                       X,
+ *                       std::nullopt,
+ *                       centroids.view(),
+ *                       labels.view(),
+ *                       raft::make_scalar_view(&inertia),
+ *                       raft::make_scalar_view(&n_iter));
+ * @endcode
+ *
+ * @tparam DataT the type of data used for weights, distances.
+ * @tparam IndexT the type of data used for indexing.
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Training instances to cluster. The data must be
+ *                              in row-major format.
+ *                              [dim = n_samples x n_features]
+ * @param[in]     sample_weight Optional weights for each observation in X.
+ *                              [len = n_samples]
+ * @param[inout]  centroids     Optional
+ *                              [in] When init is InitMethod::Array, use
+ *                              centroids as the initial cluster centers
+ *                              [out] The generated centroids from the
+ *                              kmeans algorithm are stored at the address
+ *                              pointed by 'centroids'.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    labels        Index of the cluster each sample in X belongs
+ *                              to.
+ *                              [len = n_samples]
+ * @param[out]    inertia       Sum of squared distances of samples to their
+ *                              closest cluster center.
+ * @param[out]    n_iter        Number of iterations run.
+ */
+template <typename DataT, typename IndexT>
+void fit_predict(handle_t const& handle,
+                 const KMeansParams& params,
+                 raft::device_matrix_view<const DataT, IndexT> X,
+                 std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
+                 std::optional<raft::device_matrix_view<DataT, IndexT>> centroids,
+                 raft::device_vector_view<IndexT, IndexT> labels,
+                 raft::host_scalar_view<DataT> inertia,
+                 raft::host_scalar_view<IndexT> n_iter)
+{
+  detail::kmeans_fit_predict<DataT, IndexT>(
+    handle, params, X, sample_weight, centroids, labels, inertia, n_iter);
+}
+
+/**
+ * @brief Transform X to a cluster-distance space.
+ *
+ * @tparam DataT the type of data used for weights, distances.
+ * @tparam IndexT the type of data used for indexing.
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Training instances to cluster. The data must
+ *                              be in row-major format
+ *                              [dim = n_samples x n_features]
+ * @param[in]     centroids     Cluster centroids. The data must be in row-major format.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    X_new         X transformed in the new space.
+ *                              [dim = n_samples x n_features]
+ */
+template <typename DataT, typename IndexT>
+void transform(const raft::handle_t& handle,
+               const KMeansParams& params,
+               raft::device_matrix_view<const DataT, IndexT> X,
+               raft::device_matrix_view<const DataT, IndexT> centroids,
+               raft::device_matrix_view<DataT, IndexT> X_new)
+{
+  detail::kmeans_transform<DataT, IndexT>(handle, params, X, centroids, X_new);
+}
+
+template <typename DataT, typename IndexT>
+void transform(const raft::handle_t& handle,
+               const KMeansParams& params,
+               const DataT* X,
+               const DataT* centroids,
+               IndexT n_samples,
+               IndexT n_features,
+               DataT* X_new)
+{
+  detail::kmeans_transform<DataT, IndexT>(
+    handle, params, X, centroids, n_samples, n_features, X_new);
+}
+
+/**
+ * @brief Select centroids according to a sampling operation
+ *
+ * @tparam DataT the type of data used for weights, distances.
+ * @tparam IndexT the type of data used for indexing.
+ *
+ * @param[in]  handle             The raft handle
+ * @param[in]  X                  The data in row-major format
+ *                                [dim = n_samples x n_features]
+ * @param[in]  minClusterDistance Distance for every sample to its nearest centroid
+ *                                [dim = n_samples]
+ * @param[in]  isSampleCentroid   Flag the sample chosen as initial centroid
+ *                                [dim = n_samples]
+ * @param[in]  select_op          The sampling operation used to select the centroids
+ * @param[out] inRankCp           The sampled centroids
+ *                                [dim = n_selected_centroids x n_features]
+ * @param[in]  workspace          Temporary workspace buffer which can get resized
+ *
+ */
+template <typename DataT, typename IndexT>
+void sample_centroids(const raft::handle_t& handle,
+                      raft::device_matrix_view<const DataT, IndexT> X,
+                      raft::device_vector_view<DataT, IndexT> minClusterDistance,
+                      raft::device_vector_view<std::uint8_t, IndexT> isSampleCentroid,
+                      SamplingOp<DataT, IndexT>& select_op,
+                      rmm::device_uvector<DataT>& inRankCp,
+                      rmm::device_uvector<char>& workspace)
+{
+  detail::sampleCentroids<DataT, IndexT>(
+    handle, X, minClusterDistance, isSampleCentroid, select_op, inRankCp, workspace);
+}
+
+/**
+ * @brief Compute cluster cost
+ *
+ * @tparam DataT the type of data used for weights, distances.
+ * @tparam IndexT the type of data used for indexing.
+ * @tparam ReductionOpT the type of the reduction operation.
+ *
+ * @param[in]  handle             The raft handle
+ * @param[in]  minClusterDistance Distance for every sample to its nearest centroid
+ *                                [dim = n_samples]
+ * @param[in]  workspace          Temporary workspace buffer which can get resized
+ * @param[out] clusterCost        Resulting cluster cost
+ * @param[in]  reduction_op       The reduction operation used for the cost
+ */
+template <typename DataT, typename IndexT, typename ReductionOpT>
+void cluster_cost(const raft::handle_t& handle,
+                  raft::device_vector_view<DataT, IndexT> minClusterDistance,
+                  rmm::device_uvector<char>& workspace,  // by reference: the buffer is move-only
+                  raft::device_scalar_view<DataT> clusterCost,
+                  ReductionOpT reduction_op)
+{
+  detail::computeClusterCost<DataT, ReductionOpT, IndexT>(
+    handle, minClusterDistance, workspace, clusterCost, reduction_op);
+}
+
+/**
+ * @brief Compute distance for every sample to its nearest centroid
+ *
+ * @tparam DataT the type of data used for weights, distances.
+ * @tparam IndexT the type of data used for indexing.
+ *
+ * @param[in]  handle               The raft handle
+ * @param[in]  params               The parameters for KMeans
+ * @param[in]  X                    The data in row-major format
+ *                                  [dim = n_samples x n_features]
+ * @param[in]  centroids            Centroids data
+ *                                  [dim = n_cluster x n_features]
+ * @param[out] minClusterDistance   Distance for every sample to its nearest centroid
+ *                                  [dim = n_samples]
+ * @param[in]  L2NormX              L2 norm of X : ||x||^2
+ *                                  [dim = n_samples]
+ * @param[out] L2NormBuf_OR_DistBuf Resizable buffer to store L2 norm of centroids or distance
+ *                                  matrix
+ * @param[in]  workspace            Temporary workspace buffer which can get resized
+ *
+ */
+template <typename DataT, typename IndexT>
+void min_cluster_distance(const raft::handle_t& handle,
+                          const KMeansParams& params,
+                          raft::device_matrix_view<const DataT, IndexT> X,
+                          raft::device_matrix_view<DataT, IndexT> centroids,
+                          raft::device_vector_view<DataT, IndexT> minClusterDistance,
+                          raft::device_vector_view<DataT, IndexT> L2NormX,
+                          rmm::device_uvector<DataT>& L2NormBuf_OR_DistBuf,
+                          rmm::device_uvector<char>& workspace)
+{
+  detail::minClusterDistanceCompute<DataT, IndexT>(
+    handle, params, X, centroids, minClusterDistance, L2NormX, L2NormBuf_OR_DistBuf, workspace);
+}
+
+/**
+ * @brief Calculates a <key, value> pair for every sample in input 'X' where key is an
+ * index of one of the 'centroids' (index of the nearest centroid) and 'value'
+ * is the distance between the sample and the 'centroid[key]'
+ *
+ * @tparam DataT the type of data used for weights, distances.
+ * @tparam IndexT the type of data used for indexing.
+ *
+ * @param[in]  handle                The raft handle
+ * @param[in]  params                The parameters for KMeans
+ * @param[in]  X                     The data in row-major format
+ *                                   [dim = n_samples x n_features]
+ * @param[in]  centroids             Centroids data
+ *                                   [dim = n_cluster x n_features]
+ * @param[out] minClusterAndDistance Distance vector that contains for every sample, the nearest
+ *                                   centroid and its distance
+ *                                   [dim = n_samples]
+ * @param[in]  L2NormX               L2 norm of X : ||x||^2
+ *                                   [dim = n_samples]
+ * @param[out] L2NormBuf_OR_DistBuf  Resizable buffer to store L2 norm of centroids or distance
+ *                                   matrix
+ * @param[in]  workspace             Temporary workspace buffer which can get resized
+ *
+ */
+template <typename DataT, typename IndexT>
+void min_cluster_and_distance(
+  const raft::handle_t& handle,
+  const KMeansParams& params,
+  raft::device_matrix_view<const DataT, IndexT> X,
+  raft::device_matrix_view<const DataT, IndexT> centroids,
+  raft::device_vector_view<raft::KeyValuePair<IndexT, DataT>, IndexT> minClusterAndDistance,
+  raft::device_vector_view<DataT, IndexT> L2NormX,
+  rmm::device_uvector<DataT>& L2NormBuf_OR_DistBuf,
+  rmm::device_uvector<char>& workspace)
+{
+  detail::minClusterAndDistanceCompute<DataT, IndexT>(
+    handle, params, X, centroids, minClusterAndDistance, L2NormX, L2NormBuf_OR_DistBuf, workspace);
+}
+
+/**
+ * @brief Shuffle and randomly select 'n_samples_to_gather' samples from input 'in' and store
+ * them in 'out'. The input 'in' is not modified.
+ *
+ * @tparam DataT the type of data used for weights, distances.
+ * @tparam IndexT the type of data used for indexing.
+ *
+ * @param[in]  handle              The raft handle
+ * @param[in]  in                  The data to shuffle and gather
+ *                                 [dim = n_samples x n_features]
+ * @param[out] out                 The sampled data
+ *                                 [dim = n_samples_to_gather x n_features]
+ * @param[in]  n_samples_to_gather Number of samples to gather
+ * @param[in]  seed                Seed for the shuffle
+ * @param[in]  workspace           Temporary workspace buffer which can get resized
+ *
+ */
+template <typename DataT, typename IndexT>
+void shuffle_and_gather(const raft::handle_t& handle,
+                        raft::device_matrix_view<const DataT, IndexT> in,
+                        raft::device_matrix_view<DataT, IndexT> out,
+                        uint32_t n_samples_to_gather,
+                        uint64_t seed,
+                        rmm::device_uvector<char>* workspace = nullptr)
+{
+  detail::shuffleAndGather<DataT, IndexT>(handle, in, out, n_samples_to_gather, seed, workspace);
+}
+
+/**
+ * @brief Count the number of samples in each cluster
+ *
+ * @tparam DataT the type of data used for weights, distances.
+ * @tparam IndexT the type of data used for indexing.
+ *
+ * @param[in]  handle               The raft handle
+ * @param[in]  params               The parameters for KMeans
+ * @param[in]  X                    The data in row-major format
+ *                                  [dim = n_samples x n_features]
+ * @param[in]  L2NormX              L2 norm of X : ||x||^2
+ *                                  [dim = n_samples]
+ * @param[in]  centroids            Centroids data
+ *                                  [dim = n_cluster x n_features]
+ * @param[in]  workspace            Temporary workspace buffer which can get resized
+ * @param[out] sampleCountInCluster The count for each centroid
+ *                                  [dim = n_cluster]
+ *
+ */
+template <typename DataT, typename IndexT>
+void count_samples_in_cluster(const raft::handle_t& handle,
+                              const KMeansParams& params,
+                              raft::device_matrix_view<const DataT, IndexT> X,
+                              raft::device_vector_view<DataT, IndexT> L2NormX,
+                              raft::device_matrix_view<DataT, IndexT> centroids,
+                              rmm::device_uvector<char>& workspace,
+                              raft::device_vector_view<DataT, IndexT> sampleCountInCluster)
+{
+  detail::countSamplesInCluster<DataT, IndexT>(
+    handle, params, X, L2NormX, centroids, workspace, sampleCountInCluster);
+}
+
+/**
+ * @brief Selects 'n_clusters' samples from the input X using kmeans++ algorithm.
+ *
+ * @see "k-means++: the advantages of careful seeding". 2007, Arthur, D. and Vassilvitskii, S.
+ *        ACM-SIAM symposium on Discrete algorithms.
+ *
+ * @tparam DataT the type of data used for weights, distances.
+ * @tparam IndexT the type of data used for indexing.
+ *
+ * @param[in]  handle                The raft handle
+ * @param[in]  params                The parameters for KMeans
+ * @param[in]  X                     The data in row-major format
+ *                                   [dim = n_samples x n_features]
+ * @param[out] centroids             Centroids data
+ *                                   [dim = n_cluster x n_features]
+ * @param[in]  workspace             Temporary workspace buffer which can get resized
+ */
+template <typename DataT, typename IndexT>
+void init_plus_plus(const raft::handle_t& handle,
+                    const KMeansParams& params,
+                    raft::device_matrix_view<const DataT, IndexT> X,
+                    raft::device_matrix_view<DataT, IndexT> centroids,
+                    rmm::device_uvector<char>& workspace)
+{
+  detail::kmeansPlusPlus<DataT, IndexT>(handle, params, X, centroids, workspace);
+}
+
+/**
+ * @brief Main function used to fit KMeans (after cluster initialization)
+ *
+ * @tparam DataT the type of data used for weights, distances.
+ * @tparam IndexT the type of data used for indexing.
+ *
+ * @param[in]     handle        The raft handle.
+ * @param[in]     params        Parameters for KMeans model.
+ * @param[in]     X             Training instances to cluster. The data must
+ *                              be in row-major format.
+ *                              [dim = n_samples x n_features]
+ * @param[in]     weight        Weights for each observation in X.
+ *                              [len = n_samples]
+ * @param[inout]  centroids     [in] Initial cluster centers.
+ *                              [out] The generated centroids from the
+ *                              kmeans algorithm are stored at the address
+ *                              pointed by 'centroids'.
+ *                              [dim = n_clusters x n_features]
+ * @param[out]    inertia       Sum of squared distances of samples to their
+ *                              closest cluster center.
+ * @param[out]    n_iter        Number of iterations run.
+ * @param[in]     workspace     Temporary workspace buffer which can get resized
+ */
+template <typename DataT, typename IndexT>
+void fit_main(const raft::handle_t& handle,
+              const KMeansParams& params,
+              raft::device_matrix_view<const DataT, IndexT> X,
+              raft::device_vector_view<const DataT, IndexT> weight,
+              raft::device_matrix_view<DataT, IndexT> centroids,
+              raft::host_scalar_view<DataT> inertia,
+              raft::host_scalar_view<IndexT> n_iter,
+              rmm::device_uvector<char>& workspace)
+{
+  detail::kmeans_fit_main<DataT, IndexT>(
+    handle, params, X, weight, centroids, inertia, n_iter, workspace);
+}
+
+};  // end namespace raft::cluster::kmeans
+
 namespace raft::cluster {
+
+/**
+ * Note: All of the functions below in raft::cluster are deprecated and will
+ * be removed in a future release. Please use raft::cluster::kmeans instead.
+ */
+
 /**
  * @brief Find clusters with k-means algorithm.
  *   Initial centroids are chosen with k-means++ algorithm. Empty
@@ -55,7 +557,7 @@ void kmeans_fit(handle_t const& handle,
                 raft::host_scalar_view<DataT> inertia,
                 raft::host_scalar_view<IndexT> n_iter)
 {
-  detail::kmeans_fit<DataT, IndexT>(handle, params, X, sample_weight, centroids, inertia, n_iter);
+  kmeans::fit<DataT, IndexT>(handle, params, X, sample_weight, centroids, inertia, n_iter);
 }
 
 template <typename DataT, typename IndexT = int>
@@ -69,7 +571,7 @@ void kmeans_fit(handle_t const& handle,
                 DataT& inertia,
                 IndexT& n_iter)
 {
-  detail::kmeans_fit<DataT, IndexT>(
+  kmeans::fit<DataT, IndexT>(
     handle, params, X, sample_weight, centroids, n_samples, n_features, inertia, n_iter);
 }
 
@@ -103,7 +605,7 @@ void kmeans_predict(handle_t const& handle,
                     bool normalize_weight,
                     raft::host_scalar_view<DataT> inertia)
 {
-  detail::kmeans_predict<DataT, IndexT>(
+  kmeans::predict<DataT, IndexT>(
     handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia);
 }
 
@@ -119,16 +621,16 @@ void kmeans_predict(handle_t const& handle,
                     bool normalize_weight,
                     DataT& inertia)
 {
-  detail::kmeans_predict<DataT, IndexT>(handle,
-                                        params,
-                                        X,
-                                        sample_weight,
-                                        centroids,
-                                        n_samples,
-                                        n_features,
-                                        labels,
-                                        normalize_weight,
-                                        inertia);
+  kmeans::predict<DataT, IndexT>(handle,
+                                 params,
+                                 X,
+                                 sample_weight,
+                                 centroids,
+                                 n_samples,
+                                 n_features,
+                                 labels,
+                                 normalize_weight,
+                                 inertia);
 }
 
 /**
@@ -168,7 +670,7 @@ void kmeans_fit_predict(handle_t const& handle,
                         raft::host_scalar_view<DataT> inertia,
                         raft::host_scalar_view<IndexT> n_iter)
 {
-  detail::kmeans_fit_predict<DataT, IndexT>(
+  kmeans::fit_predict<DataT, IndexT>(
     handle, params, X, sample_weight, centroids, labels, inertia, n_iter);
 }
 
@@ -184,7 +686,7 @@ void kmeans_fit_predict(handle_t const& handle,
                         DataT& inertia,
                         IndexT& n_iter)
 {
-  detail::kmeans_fit_predict<DataT, IndexT>(
+  kmeans::fit_predict<DataT, IndexT>(
     handle, params, X, sample_weight, centroids, n_samples, n_features, labels, inertia, n_iter);
 }
 
@@ -210,7 +712,7 @@ void kmeans_transform(const raft::handle_t& handle,
                       raft::device_matrix_view<const DataT, IndexT> centroids,
                       raft::device_matrix_view<DataT, IndexT> X_new)
 {
-  detail::kmeans_transform<DataT, IndexT>(handle, params, X, centroids, X_new);
+  kmeans::transform<DataT, IndexT>(handle, params, X, centroids, X_new);
 }
 
 template <typename DataT, typename IndexT = int>
@@ -222,15 +724,14 @@ void kmeans_transform(const raft::handle_t& handle,
                       IndexT n_features,
                       DataT* X_new)
 {
-  detail::kmeans_transform<DataT, IndexT>(
-    handle, params, X, centroids, n_samples, n_features, X_new);
+  kmeans::transform<DataT, IndexT>(handle, params, X, centroids, n_samples, n_features, X_new);
 }
 
-template <typename DataT, typename IndexT = int>
-using SamplingOp = detail::SamplingOp<DataT, IndexT>;
+template <typename DataT, typename IndexT>
+using SamplingOp = kmeans::SamplingOp<DataT, IndexT>;
 
 template <typename IndexT, typename DataT>
-using KeyValueIndexOp = detail::KeyValueIndexOp<IndexT, DataT>;
+using KeyValueIndexOp = kmeans::KeyValueIndexOp<IndexT, DataT>;
 
 /**
  * @brief Select centroids according to a sampling operation
@@ -251,16 +752,16 @@ using KeyValueIndexOp = detail::KeyValueIndexOp<IndexT, DataT>;
  * @param[in]  workspace          Temporary workspace buffer which can get resized
  *
  */
-template <typename DataT, typename IndexT = int>
+template <typename DataT, typename IndexT>
 void sampleCentroids(const raft::handle_t& handle,
-                     const raft::device_matrix_view<const DataT, IndexT>& X,
-                     const raft::device_vector_view<DataT, IndexT>& minClusterDistance,
-                     const raft::device_vector_view<IndexT, IndexT>& isSampleCentroid,
+                     raft::device_matrix_view<const DataT, IndexT> X,
+                     raft::device_vector_view<DataT, IndexT> minClusterDistance,
+                     raft::device_vector_view<std::uint8_t, IndexT> isSampleCentroid,
                      SamplingOp<DataT, IndexT>& select_op,
                      rmm::device_uvector<DataT>& inRankCp,
                      rmm::device_uvector<char>& workspace)
 {
-  detail::sampleCentroids<DataT, IndexT>(
+  kmeans::sample_centroids<DataT, IndexT>(
     handle, X, minClusterDistance, isSampleCentroid, select_op, inRankCp, workspace);
 }
 
@@ -278,14 +779,14 @@ void sampleCentroids(const raft::handle_t& handle,
  * @param[in]  reduction_op       The reduction operation used for the cost
  *
  */
-template <typename DataT, typename ReductionOpT, typename IndexT = int>
+template <typename DataT, typename IndexT, typename ReductionOpT>
 void computeClusterCost(const raft::handle_t& handle,
-                        const raft::device_vector_view<DataT, IndexT>& minClusterDistance,
+                        raft::device_vector_view<DataT, IndexT> minClusterDistance,
                         rmm::device_uvector<char>& workspace,
-                        const raft::device_scalar_view<DataT>& clusterCost,
+                        raft::device_scalar_view<DataT> clusterCost,
                         ReductionOpT reduction_op)
 {
-  detail::computeClusterCost<DataT, ReductionOpT, IndexT>(
+  kmeans::cluster_cost<DataT, ReductionOpT, IndexT>(
     handle, minClusterDistance, workspace, clusterCost, reduction_op);
 }
 
@@ -313,14 +814,14 @@ void computeClusterCost(const raft::handle_t& handle,
 template <typename DataT, typename IndexT>
 void minClusterDistanceCompute(const raft::handle_t& handle,
                                const KMeansParams& params,
-                               const raft::device_matrix_view<const DataT, IndexT>& X,
-                               const raft::device_matrix_view<DataT, IndexT>& centroids,
-                               const raft::device_vector_view<DataT, IndexT>& minClusterDistance,
-                               const raft::device_vector_view<DataT, IndexT>& L2NormX,
+                               raft::device_matrix_view<const DataT, IndexT> X,
+                               raft::device_matrix_view<DataT, IndexT> centroids,
+                               raft::device_vector_view<DataT, IndexT> minClusterDistance,
+                               raft::device_vector_view<DataT, IndexT> L2NormX,
                                rmm::device_uvector<DataT>& L2NormBuf_OR_DistBuf,
                                rmm::device_uvector<char>& workspace)
 {
-  detail::minClusterDistanceCompute<DataT, IndexT>(
+  kmeans::min_cluster_distance<DataT, IndexT>(
     handle, params, X, centroids, minClusterDistance, L2NormX, L2NormBuf_OR_DistBuf, workspace);
 }
 
@@ -352,14 +853,14 @@ template <typename DataT, typename IndexT>
 void minClusterAndDistanceCompute(
   const raft::handle_t& handle,
   const KMeansParams& params,
-  const raft::device_matrix_view<const DataT, IndexT> X,
-  const raft::device_matrix_view<const DataT, IndexT> centroids,
-  const raft::device_vector_view<raft::KeyValuePair<IndexT, DataT>, IndexT>& minClusterAndDistance,
-  const raft::device_vector_view<DataT, IndexT>& L2NormX,
+  raft::device_matrix_view<const DataT, IndexT> X,
+  raft::device_matrix_view<const DataT, IndexT> centroids,
+  raft::device_vector_view<raft::KeyValuePair<IndexT, DataT>, IndexT> minClusterAndDistance,
+  raft::device_vector_view<DataT, IndexT> L2NormX,
   rmm::device_uvector<DataT>& L2NormBuf_OR_DistBuf,
   rmm::device_uvector<char>& workspace)
 {
-  detail::minClusterAndDistanceCompute<DataT, IndexT>(
+  kmeans::min_cluster_and_distance<DataT, IndexT>(
     handle, params, X, centroids, minClusterAndDistance, L2NormX, L2NormBuf_OR_DistBuf, workspace);
 }
 
@@ -382,13 +883,13 @@ void minClusterAndDistanceCompute(
  */
 template <typename DataT, typename IndexT>
 void shuffleAndGather(const raft::handle_t& handle,
-                      const raft::device_matrix_view<const DataT, IndexT>& in,
-                      const raft::device_matrix_view<DataT, IndexT>& out,
+                      raft::device_matrix_view<const DataT, IndexT> in,
+                      raft::device_matrix_view<DataT, IndexT> out,
                       uint32_t n_samples_to_gather,
                       uint64_t seed,
                       rmm::device_uvector<char>* workspace = nullptr)
 {
-  detail::shuffleAndGather<DataT, IndexT>(handle, in, out, n_samples_to_gather, seed, workspace);
+  kmeans::shuffle_and_gather<DataT, IndexT>(handle, in, out, n_samples_to_gather, seed, workspace);
 }
 
 /**
@@ -413,13 +914,13 @@ void shuffleAndGather(const raft::handle_t& handle,
 template <typename DataT, typename IndexT>
 void countSamplesInCluster(const raft::handle_t& handle,
                            const KMeansParams& params,
-                           const raft::device_matrix_view<const DataT, IndexT>& X,
-                           const raft::device_vector_view<DataT, IndexT>& L2NormX,
-                           const raft::device_matrix_view<DataT, IndexT>& centroids,
+                           raft::device_matrix_view<const DataT, IndexT> X,
+                           raft::device_vector_view<DataT, IndexT> L2NormX,
+                           raft::device_matrix_view<DataT, IndexT> centroids,
                            rmm::device_uvector<char>& workspace,
-                           const raft::device_vector_view<DataT, IndexT>& sampleCountInCluster)
+                           raft::device_vector_view<DataT, IndexT> sampleCountInCluster)
 {
-  detail::countSamplesInCluster<DataT, IndexT>(
+  kmeans::count_samples_in_cluster<DataT, IndexT>(
     handle, params, X, L2NormX, centroids, workspace, sampleCountInCluster);
 }
 
@@ -444,11 +945,11 @@ void countSamplesInCluster(const raft::handle_t& handle,
 template <typename DataT, typename IndexT>
 void kmeansPlusPlus(const raft::handle_t& handle,
                     const KMeansParams& params,
-                    const raft::device_matrix_view<const DataT, IndexT>& X,
-                    const raft::device_matrix_view<DataT, IndexT>& centroidsRawData,
+                    raft::device_matrix_view<const DataT, IndexT> X,
+                    raft::device_matrix_view<DataT, IndexT> centroidsRawData,
                     rmm::device_uvector<char>& workspace)
 {
-  detail::kmeansPlusPlus<DataT, IndexT>(handle, params, X, centroidsRawData, workspace);
+  kmeans::init_plus_plus<DataT, IndexT>(handle, params, X, centroidsRawData, workspace);
 }
 
 /*
@@ -477,14 +978,14 @@ void kmeansPlusPlus(const raft::handle_t& handle,
 template <typename DataT, typename IndexT>
 void kmeans_fit_main(const raft::handle_t& handle,
                      const KMeansParams& params,
-                     const raft::device_matrix_view<const DataT, IndexT>& X,
-                     const raft::device_vector_view<const DataT, IndexT>& weight,
-                     const raft::device_matrix_view<DataT, IndexT>& centroidsRawData,
-                     const raft::host_scalar_view<DataT>& inertia,
-                     const raft::host_scalar_view<IndexT>& n_iter,
+                     raft::device_matrix_view<const DataT, IndexT> X,
+                     raft::device_vector_view<const DataT, IndexT> weight,
+                     raft::device_matrix_view<DataT, IndexT> centroidsRawData,
+                     raft::host_scalar_view<DataT> inertia,
+                     raft::host_scalar_view<IndexT> n_iter,
                      rmm::device_uvector<char>& workspace)
 {
-  detail::kmeans_fit_main<DataT, IndexT>(
+  kmeans::fit_main<DataT, IndexT>(
     handle, params, X, weight, centroidsRawData, inertia, n_iter, workspace);
 }
-}  // namespace raft::cluster
+};  // namespace raft::cluster
diff --git a/cpp/include/raft/cluster/kmeans_params.hpp b/cpp/include/raft/cluster/kmeans_params.hpp
index 433e32f5ff..a1532d9dd4 100644
--- a/cpp/include/raft/cluster/kmeans_params.hpp
+++ b/cpp/include/raft/cluster/kmeans_params.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,15 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-/**
- * This file is deprecated and will be removed in release 22.06.
- * Please use the cuh version instead.
- */
-
-/**
- * DISCLAIMER: this file is deprecated: use lap.cuh instead
- */
-
 #pragma once
 
 #pragma message(__FILE__                                                  \
diff --git a/cpp/include/raft/cluster/kmeans_types.hpp b/cpp/include/raft/cluster/kmeans_types.hpp
index 87fc7c1880..f411b12b5c 100644
--- a/cpp/include/raft/cluster/kmeans_types.hpp
+++ b/cpp/include/raft/cluster/kmeans_types.hpp
@@ -18,17 +18,36 @@
 #include <raft/distance/distance_types.hpp>
 #include <raft/random/rng_state.hpp>
 
-namespace raft {
-namespace cluster {
+namespace raft::cluster::kmeans {
 
+/**
+ * Simple object to specify hyper-parameters to the kmeans algorithm.
+ */
 struct KMeansParams {
-  enum InitMethod { KMeansPlusPlus, Random, Array };
+  enum InitMethod {
+
+    /**
+     * Sample the centroids using the kmeans++ strategy
+     */
+    KMeansPlusPlus,
+
+    /**
+     * Sample the centroids uniformly at random
+     */
+    Random,
 
-  // The number of clusters to form as well as the number of centroids to
-  // generate (default:8).
+    /**
+     * User provides the array of initial centroids
+     */
+    Array
+  };
+
+  /**
+   * The number of clusters to form as well as the number of centroids to generate (default:8).
+   */
   int n_clusters = 8;
 
-  /*
+  /**
    * Method for initialization, defaults to k-means++:
    *  - InitMethod::KMeansPlusPlus (k-means++): Use scalable k-means++ algorithm
    * to select the initial cluster centers.
@@ -38,36 +57,60 @@ struct KMeansParams {
    */
   InitMethod init = KMeansPlusPlus;
 
-  // Maximum number of iterations of the k-means algorithm for a single run.
+  /**
+   * Maximum number of iterations of the k-means algorithm for a single run.
+   */
   int max_iter = 300;
 
-  // Relative tolerance with regards to inertia to declare convergence.
+  /**
+   * Relative tolerance with regards to inertia to declare convergence.
+   */
   double tol = 1e-4;
 
-  // verbosity level.
+  /**
+   * verbosity level.
+   */
   int verbosity = RAFT_LEVEL_INFO;
 
-  // Seed to the random number generator.
+  /**
+   * Seed to the random number generator.
+   */
   raft::random::RngState rng_state =
     raft::random::RngState(0, raft::random::GeneratorType::GenPhilox);
 
-  // Metric to use for distance computation.
+  /**
+   * Metric to use for distance computation.
+   */
   raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded;
 
-  // Number of instance k-means algorithm will be run with different seeds.
+  /**
+   * Number of times the k-means algorithm will be run with different seeds.
+   */
   int n_init = 1;
 
-  // Oversampling factor for use in the k-means|| algorithm.
+  /**
+   * Oversampling factor for use in the k-means|| algorithm
+   */
   double oversampling_factor = 2.0;
 
   // batch_samples and batch_centroids are used to tile 1NN computation which is
   // useful to optimize/control the memory footprint
   // Default tile is [batch_samples x n_clusters] i.e. when batch_centroids is 0
   // then don't tile the centroids
-  int batch_samples   = 1 << 15;
-  int batch_centroids = 0;  // if 0 then batch_centroids = n_clusters
+  int batch_samples = 1 << 15;
+
+  /**
+   * if 0 then batch_centroids = n_clusters
+   */
+  int batch_centroids = 0;  //
 
   bool inertia_check = false;
 };
-}  // namespace cluster
-}  // namespace raft
+
+}  // namespace raft::cluster::kmeans
+
+namespace raft::cluster {
+
+using kmeans::KMeansParams;
+
+}  // namespace raft::cluster
diff --git a/cpp/include/raft/cluster/single_linkage.cuh b/cpp/include/raft/cluster/single_linkage.cuh
index 8e33b8389d..2d74c364b2 100644
--- a/cpp/include/raft/cluster/single_linkage.cuh
+++ b/cpp/include/raft/cluster/single_linkage.cuh
@@ -21,7 +21,11 @@
 
 namespace raft::cluster {
 
-constexpr int DEFAULT_CONST_C = 15;
+/**
+ * Note: All of the functions below in the raft::cluster namespace are deprecated
+ * and will be removed in a future release. Please use raft::cluster::hierarchy
+ * instead.
+ */
 
 /**
  * Single-linkage clustering, capable of constructing a KNN graph to
@@ -58,6 +62,11 @@ void single_linkage(const raft::handle_t& handle,
   detail::single_linkage<value_idx, value_t, dist_type>(
     handle, X, m, n, metric, out, c, n_clusters);
 }
+};  // namespace raft::cluster
+
+namespace raft::cluster::hierarchy {
+
+constexpr int DEFAULT_CONST_C = 15;
 
 /**
  * Single-linkage clustering, capable of constructing a KNN graph to
@@ -90,14 +99,14 @@ void single_linkage(const raft::handle_t& handle,
   out_arrs.children = dendrogram.data_handle();
   out_arrs.labels   = labels.data_handle();
 
-  single_linkage<idx_t, value_t, dist_type>(handle,
-                                            X.data_handle(),
-                                            static_cast<std::size_t>(X.extent(0)),
-                                            static_cast<std::size_t>(X.extent(1)),
-                                            metric,
-                                            &out_arrs,
-                                            c.has_value() ? c.value() : DEFAULT_CONST_C,
-                                            n_clusters);
+  raft::cluster::single_linkage<idx_t, value_t, dist_type>(
+    handle,
+    X.data_handle(),
+    static_cast<std::size_t>(X.extent(0)),
+    static_cast<std::size_t>(X.extent(1)),
+    metric,
+    &out_arrs,
+    c.has_value() ? c.value() : DEFAULT_CONST_C,
+    n_clusters);
 }
-
-};  // namespace raft::cluster
+};  // namespace raft::cluster::hierarchy
diff --git a/cpp/include/raft/cluster/single_linkage_types.hpp b/cpp/include/raft/cluster/single_linkage_types.hpp
index 79f2ede482..9a4fcfef60 100644
--- a/cpp/include/raft/cluster/single_linkage_types.hpp
+++ b/cpp/include/raft/cluster/single_linkage_types.hpp
@@ -18,12 +18,36 @@
 
 #include <raft/core/device_mdspan.hpp>
 
+namespace raft::cluster::hierarchy {
+
+/**
+ * Determines the method for computing the minimum spanning tree (MST)
+ */
+enum LinkageDistance {
+
+  /**
+   * Use a pairwise distance matrix as input to the mst. This
+   * is very fast and the best option for fairly small datasets (~50k data points)
+   */
+  PAIRWISE = 0,
+
+  /**
+   * Construct a KNN graph as input to the mst and provide additional
+   * edges if the mst does not converge. This is slower but scales
+   * to very large datasets.
+   */
+  KNN_GRAPH = 1
+};
+
+};  // end namespace raft::cluster::hierarchy
+
+// The code below is now considered legacy
 namespace raft::cluster {
 
-enum LinkageDistance { PAIRWISE = 0, KNN_GRAPH = 1 };
+using hierarchy::LinkageDistance;
 
 /**
- * Simple POCO for consolidating linkage results. This closely
+ * Simple container object for consolidating linkage results. This closely
  * mirrors the trained instance variables populated in
  * Scikit-learn's AgglomerativeClustering estimator.
  * @tparam value_idx
@@ -58,4 +82,4 @@ class linkage_output_int : public linkage_output<int> {
 class linkage_output_int64 : public linkage_output<int64_t> {
 };
 
-};  // namespace raft::cluster
\ No newline at end of file
+};  // namespace raft::cluster
diff --git a/cpp/include/raft/core/detail/device_mdarray.hpp b/cpp/include/raft/core/detail/device_mdarray.hpp
index ff7c31000d..ad6831794e 100644
--- a/cpp/include/raft/core/detail/device_mdarray.hpp
+++ b/cpp/include/raft/core/detail/device_mdarray.hpp
@@ -25,8 +25,8 @@
 #include <raft/core/handle.hpp>
 #include <raft/util/cudart_utils.hpp>
 
-#include <raft/core/detail/host_device_accessor.hpp>
 #include <raft/core/detail/span.hpp>  // dynamic_extent
+#include <raft/core/host_device_accessor.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp
index 05cf767a53..394ea228b4 100644
--- a/cpp/include/raft/core/device_mdspan.hpp
+++ b/cpp/include/raft/core/device_mdspan.hpp
@@ -17,16 +17,16 @@
 #pragma once
 
 #include <cstdint>
-#include <raft/core/detail/host_device_accessor.hpp>
+#include <raft/core/host_device_accessor.hpp>
 #include <raft/core/mdspan.hpp>
 
 namespace raft {
 
 template <typename AccessorPolicy>
-using device_accessor = detail::host_device_accessor<AccessorPolicy, false, true>;
+using device_accessor = host_device_accessor<AccessorPolicy, false, true>;
 
 template <typename AccessorPolicy>
-using managed_accessor = detail::host_device_accessor<AccessorPolicy, true, true>;
+using managed_accessor = host_device_accessor<AccessorPolicy, true, true>;
 
 /**
  * @brief std::experimental::mdspan with device tag to avoid accessing incorrect memory location.
@@ -162,6 +162,51 @@ template <typename ElementType,
           typename LayoutPolicy = layout_c_contiguous>
 using device_matrix_view = device_mdspan<ElementType, matrix_extent<IndexType>, LayoutPolicy>;
 
+/**
+ * @brief Shorthand for 128 byte aligned device matrix view.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy must be of type layout_{left/right}_padded
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_right_padded<ElementType>,
+          typename              = enable_if_layout_padded<ElementType, LayoutPolicy>>
+using device_aligned_matrix_view =
+  device_mdspan<ElementType,
+                matrix_extent<IndexType>,
+                LayoutPolicy,
+                std::experimental::aligned_accessor<ElementType, detail::alignment::value>>;
+
+/**
+ * @brief Create a 2-dim 128 byte aligned mdspan instance for device pointer. It's
+ *        expected that the given layout policy match the layout of the underlying
+ *        pointer.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam LayoutPolicy must be of type layout_{left/right}_padded
+ * @tparam IndexType the index type of the extents
+ * @param[in] ptr on device to wrap
+ * @param[in] n_rows number of rows in pointer
+ * @param[in] n_cols number of columns in pointer
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_right_padded<ElementType>>
+auto make_device_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols)
+{
+  using data_handle_type =
+    typename std::experimental::aligned_accessor<ElementType,
+                                                 detail::alignment::value>::data_handle_type;
+  static_assert(std::is_same<LayoutPolicy, layout_left_padded<ElementType>>::value ||
+                std::is_same<LayoutPolicy, layout_right_padded<ElementType>>::value);
+  assert(ptr == alignTo(ptr, detail::alignment::value));
+
+  data_handle_type aligned_pointer = ptr;
+
+  matrix_extent<IndexType> extents{n_rows, n_cols};
+  return device_aligned_matrix_view<ElementType, IndexType, LayoutPolicy>{aligned_pointer, extents};
+}
+
 /**
  * @brief Create a raft::managed_mdspan
  * @tparam ElementType the data type of the matrix elements
diff --git a/cpp/include/raft/core/detail/host_device_accessor.hpp b/cpp/include/raft/core/host_device_accessor.hpp
similarity index 86%
rename from cpp/include/raft/core/detail/host_device_accessor.hpp
rename to cpp/include/raft/core/host_device_accessor.hpp
index 3a71e6366b..4f6f559be4 100644
--- a/cpp/include/raft/core/detail/host_device_accessor.hpp
+++ b/cpp/include/raft/core/host_device_accessor.hpp
@@ -16,10 +16,12 @@
 
 #pragma once
 
-namespace raft::detail {
+namespace raft {
 
 /**
- * @brief A mixin to distinguish host and device memory.
+ * @brief A mixin to distinguish host and device memory. This is the primary
+ * accessor used throughout RAFT's APIs to denote whether an underlying pointer
+ * is accessible from device, host, or both.
  */
 template <typename AccessorPolicy, bool is_host, bool is_device>
 struct host_device_accessor : public AccessorPolicy {
@@ -36,4 +38,4 @@ struct host_device_accessor : public AccessorPolicy {
   host_device_accessor(AccessorPolicy const& that) : AccessorPolicy{that} {}  // NOLINT
 };
 
-}  // namespace raft::detail
+}  // namespace raft
diff --git a/cpp/include/raft/core/host_mdspan.hpp b/cpp/include/raft/core/host_mdspan.hpp
index 3fe9ea2264..0b49ca9945 100644
--- a/cpp/include/raft/core/host_mdspan.hpp
+++ b/cpp/include/raft/core/host_mdspan.hpp
@@ -19,12 +19,12 @@
 #include <cstdint>
 #include <raft/core/mdspan.hpp>
 
-#include <raft/core/detail/host_device_accessor.hpp>
+#include <raft/core/host_device_accessor.hpp>
 
 namespace raft {
 
 template <typename AccessorPolicy>
-using host_accessor = detail::host_device_accessor<AccessorPolicy, true, false>;
+using host_accessor = host_device_accessor<AccessorPolicy, true, false>;
 
 /**
  * @brief std::experimental::mdspan with host tag to avoid accessing incorrect memory location.
@@ -111,6 +111,51 @@ template <typename ElementType,
           typename LayoutPolicy = layout_c_contiguous>
 using host_matrix_view = host_mdspan<ElementType, matrix_extent<IndexType>, LayoutPolicy>;
 
+/**
+ * @brief Shorthand for 128 byte aligned host matrix view.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy must be of type layout_{left/right}_padded
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_right_padded<ElementType>,
+          typename              = enable_if_layout_padded<ElementType, LayoutPolicy>>
+using host_aligned_matrix_view =
+  host_mdspan<ElementType,
+              matrix_extent<IndexType>,
+              LayoutPolicy,
+              std::experimental::aligned_accessor<ElementType, detail::alignment::value>>;
+
+/**
+ * @brief Create a 2-dim 128 byte aligned mdspan instance for host pointer. It's
+ *        expected that the given layout policy matches the layout of the underlying
+ *        pointer.
+ * @tparam ElementType the data type of the matrix elements
+ * @tparam IndexType the index type of the extents
+ * @tparam LayoutPolicy must be of type layout_{left/right}_padded
+ * @param[in] ptr host pointer to wrap, asserted to be 128 byte aligned
+ * @param[in] n_rows number of rows in pointer
+ * @param[in] n_cols number of columns in pointer
+ */
+template <typename ElementType,
+          typename IndexType    = std::uint32_t,
+          typename LayoutPolicy = layout_right_padded<ElementType>>
+auto make_host_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols)
+{
+  using data_handle_type =
+    typename std::experimental::aligned_accessor<ElementType,
+                                                 detail::alignment::value>::data_handle_type;
+
+  static_assert(std::is_same<LayoutPolicy, layout_left_padded<ElementType>>::value ||
+                std::is_same<LayoutPolicy, layout_right_padded<ElementType>>::value);
+  assert(ptr == alignTo(ptr, detail::alignment::value));
+  data_handle_type aligned_pointer = ptr;
+
+  matrix_extent<IndexType> extents{n_rows, n_cols};
+  return host_aligned_matrix_view<ElementType, IndexType, LayoutPolicy>{aligned_pointer, extents};
+}
+
 /**
  * @brief Create a 0-dim (scalar) mdspan instance for host value.
  *
diff --git a/cpp/include/raft/core/logger.hpp b/cpp/include/raft/core/logger.hpp
index 44c8263abf..3984ec042a 100644
--- a/cpp/include/raft/core/logger.hpp
+++ b/cpp/include/raft/core/logger.hpp
@@ -78,7 +78,11 @@ namespace detail {
  */
 inline std::string format(const char* fmt, va_list& vl)
 {
-  int length = std::vsnprintf(nullptr, 0, fmt, vl);
+  // Measure with a copy: vsnprintf consumes its va_list, and vl is needed again below.
+  va_list vl_copy;
+  va_copy(vl_copy, vl);
+  int length = std::vsnprintf(nullptr, 0, fmt, vl_copy);
+  va_end(vl_copy);  // every va_copy must be paired with va_end in the same function
   assert(length >= 0);
   std::vector<char> buf(length + 1);
   std::vsnprintf(buf.data(), length + 1, fmt, vl);
diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp
index 44730d901e..ae5d236395 100644
--- a/cpp/include/raft/core/mdarray.hpp
+++ b/cpp/include/raft/core/mdarray.hpp
@@ -24,8 +24,8 @@
 
 #include <stddef.h>
 
-#include <raft/core/detail/host_device_accessor.hpp>
 #include <raft/core/detail/macros.hpp>
+#include <raft/core/host_device_accessor.hpp>
 #include <raft/core/mdspan.hpp>
 #include <raft/core/mdspan_types.hpp>
 #include <rmm/cuda_stream_view.hpp>
@@ -154,13 +154,12 @@ class mdarray
               std::conditional_t<std::is_const_v<E>,
                                  typename container_policy_type::const_accessor_policy,
                                  typename container_policy_type::accessor_policy>>
-  using view_type_impl =
-    mdspan<E,
-           extents_type,
-           layout_type,
-           detail::host_device_accessor<ViewAccessorPolicy,
-                                        container_policy_type::is_host_accessible,
-                                        container_policy_type::is_device_accessible>>;
+  using view_type_impl = mdspan<E,
+                                extents_type,
+                                layout_type,
+                                host_device_accessor<ViewAccessorPolicy,
+                                                     container_policy_type::is_host_accessible,
+                                                     container_policy_type::is_device_accessible>>;
 
  public:
   /**
diff --git a/cpp/include/raft/core/mdspan.hpp b/cpp/include/raft/core/mdspan.hpp
index a858633e07..1faac44cc8 100644
--- a/cpp/include/raft/core/mdspan.hpp
+++ b/cpp/include/raft/core/mdspan.hpp
@@ -18,9 +18,9 @@
 #include <raft/core/error.hpp>
 #include <raft/core/mdspan_types.hpp>
 
-#include <raft/core/detail/host_device_accessor.hpp>
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/detail/mdspan_util.cuh>
+#include <raft/core/host_device_accessor.hpp>
 
 #include <raft/thirdparty/mdspan/include/experimental/mdspan>
 
@@ -32,6 +32,40 @@ template <typename ElementType,
           typename AccessorPolicy = std::experimental::default_accessor<ElementType>>
 using mdspan = std::experimental::mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>;
 
+namespace detail {
+
+// Number of ValueType elements per ByteAlignment bytes (min 1); alignment is tunable for tests.
+template <class ValueType, size_t ByteAlignment = 128>
+struct padding {
+  static_assert(std::is_same<std::remove_cv_t<ValueType>, ValueType>::value,
+                "raft::detail::padding ValueType has to be provided without "
+                "const or volatile specifiers.");
+  static_assert(ByteAlignment % sizeof(ValueType) == 0 || sizeof(ValueType) % ByteAlignment == 0,
+                "raft::detail::padding sizeof(ValueType) has to be multiple or "
+                "divider of ByteAlignment.");
+  static constexpr size_t value = std::max<size_t>(ByteAlignment / sizeof(ValueType), 1);
+};
+
+// alignment fixed to 128 bytes
+struct alignment {
+  static constexpr size_t value = 128;
+};
+
+}  // namespace detail
+
+template <typename ElementType>
+using layout_right_padded = std::experimental::layout_right_padded<
+  detail::padding<std::remove_cv_t<std::remove_reference_t<ElementType>>>::value>;
+
+template <typename ElementType>
+using layout_left_padded = std::experimental::layout_left_padded<
+  detail::padding<std::remove_cv_t<std::remove_reference_t<ElementType>>>::value>;
+
+template <typename ElementType, typename LayoutPolicy>
+using enable_if_layout_padded =
+  std::enable_if_t<std::is_same<LayoutPolicy, layout_left_padded<ElementType>>::value ||
+                   std::is_same<LayoutPolicy, layout_right_padded<ElementType>>::value>;
+
 /**
  * Ensure all types listed in the parameter pack `Extents` are integral types.
  * Usage:
@@ -149,10 +183,9 @@ template <typename ElementType,
           size_t... Extents>
 auto make_mdspan(ElementType* ptr, extents<IndexType, Extents...> exts)
 {
-  using accessor_type =
-    detail::host_device_accessor<std::experimental::default_accessor<ElementType>,
-                                 is_host_accessible,
-                                 is_device_accessible>;
+  using accessor_type = host_device_accessor<std::experimental::default_accessor<ElementType>,
+                                             is_host_accessible,
+                                             is_device_accessible>;
 
   return mdspan<ElementType, decltype(exts), LayoutPolicy, accessor_type>{ptr, exts};
 }
@@ -255,4 +288,5 @@ RAFT_INLINE_FUNCTION auto unravel_index(Idx idx,
     return unravel_index_impl<uint32_t>(static_cast<uint32_t>(idx), shape);
   }
 }
+
 }  // namespace raft
diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh
new file mode 100644
index 0000000000..54ac490ca4
--- /dev/null
+++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/distance/distance.cuh>
+
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/gemm.cuh>
+
+namespace raft::distance::kernels::detail {
+
+/**
+ * Base class for general Gram matrices
+ * A Gram matrix is the Hermitian matrix of inner products G_ik = <x_i, x_k>
+ * Here, the  inner product is evaluated for all elements from vectors sets X1,
+ * and X2.
+ *
+ * To be more precise, on exit the output buffer will store:
+ * - if is_row_major == true: out[j*n2 + k] = <x1_j, x2_k>,
+ * - if is_row_major == false: out[j + k*n1] = <x1_j, x2_k>,
+ * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector
+ * from the x2 set.
+ */
+template <typename math_t>
+class GramMatrixBase {
+  cublasHandle_t cublas_handle;
+
+ public:
+  GramMatrixBase(cublasHandle_t cublas_handle) : cublas_handle(cublas_handle){};
+
+  virtual ~GramMatrixBase(){};
+
+  /** Convenience function to evaluate the Gram matrix for two vector sets.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number vectors in x1
+   * @param [in] n_cols number of columns (features) in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*n_cols]
+   * @param [in] n2 number vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1
+   * @param ld2 leading dimension of x2
+   * @param ld_out leading dimension of out
+   */
+  virtual void operator()(const math_t* x1,
+                          int n1,
+                          int n_cols,
+                          const math_t* x2,
+                          int n2,
+                          math_t* out,
+                          bool is_row_major,
+                          cudaStream_t stream,
+                          int ld1    = 0,
+                          int ld2    = 0,
+                          int ld_out = 0)
+  {
+    if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; }
+    if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; }
+    if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; }
+    evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+  }
+
+  /** Evaluate the Gram matrix for two vector sets using simple dot product.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number vectors in x1
+   * @param [in] n_cols number of columns (features) in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*n_cols]
+   * @param [in] n2 number vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1 (usually it is n1)
+   * @param ld2 leading dimension of x2 (usually it is n2)
+   * @param ld_out leading dimension of out (usually it is n1)
+   */
+  virtual void evaluate(const math_t* x1,
+                        int n1,
+                        int n_cols,
+                        const math_t* x2,
+                        int n2,
+                        math_t* out,
+                        bool is_row_major,
+                        cudaStream_t stream,
+                        int ld1,
+                        int ld2,
+                        int ld_out)
+  {
+    linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+  }
+
+  // private:
+  // The following methods should be private, they are kept public to avoid:
+  // "error: The enclosing parent function ("distance") for an extended
+  // __device__ lambda cannot have private or protected access within its class"
+
+  /** Calculates the Gram matrix using simple dot product between vector sets.
+   *
+   * out = x1 * x2
+   *
+   * Can be used as a building block for more complex kernel functions.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number vectors in x1
+   * @param [in] n_cols number of columns (features) in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*n_cols]
+   * @param [in] n2 number vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1
+   * @param ld2 leading dimension of x2
+   * @param ld_out leading dimension of out
+   */
+  void linear(const math_t* x1,
+              int n1,
+              int n_cols,
+              const math_t* x2,
+              int n2,
+              math_t* out,
+              bool is_row_major,
+              cudaStream_t stream,
+              int ld1,
+              int ld2,
+              int ld_out)
+  {
+    math_t alpha = 1.0;
+    math_t beta  = 0.0;
+    if (is_row_major) {
+      // #TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
+                                                       CUBLAS_OP_T,
+                                                       CUBLAS_OP_N,
+                                                       n2,
+                                                       n1,
+                                                       n_cols,
+                                                       &alpha,
+                                                       x2,
+                                                       ld2,
+                                                       x1,
+                                                       ld1,
+                                                       &beta,
+                                                       out,
+                                                       ld_out,
+                                                       stream));
+    } else {
+      // #TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
+                                                       CUBLAS_OP_N,
+                                                       CUBLAS_OP_T,
+                                                       n1,
+                                                       n2,
+                                                       n_cols,
+                                                       &alpha,
+                                                       x1,
+                                                       ld1,
+                                                       x2,
+                                                       ld2,
+                                                       &beta,
+                                                       out,
+                                                       ld_out,
+                                                       stream));
+    }
+  }
+
+  /** Calculates the Gram matrix using Euclidean distance.
+   *
+   * Can be used as a building block for more complex kernel functions.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number vectors in x1
+   * @param [in] n_cols number of columns (features) in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*n_cols]
+   * @param [in] n2 number vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1
+   * @param ld2 leading dimension of x2
+   * @param ld_out leading dimension of out
+   */
+  virtual void distance(const math_t* x1,
+                        int n1,
+                        int n_cols,
+                        const math_t* x2,
+                        int n2,
+                        math_t* out,
+                        bool is_row_major,
+                        cudaStream_t stream,
+                        int ld1,
+                        int ld2,
+                        int ld_out)
+  {
+    raft::distance::distance<raft::distance::DistanceType::L2Unexpanded, math_t, math_t, math_t>(
+      x1, x2, out, n1, n2, n_cols, stream, is_row_major);
+  }
+};
+};  // end namespace raft::distance::kernels::detail
\ No newline at end of file
diff --git a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh
new file mode 100644
index 0000000000..1aa6809bcd
--- /dev/null
+++ b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "gram_matrix.cuh"
+#include "kernel_matrices.cuh"
+#include <raft/distance/distance_types.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft::distance::kernels::detail {
+
+template <typename math_t>
+class KernelFactory {
+ public:
+  /** Instantiate the Gram matrix evaluator selected by params.kernel.
+   *  The object is allocated with new and handed back as a raw pointer;
+   *  an unrecognised kernel type raises raft::exception. */
+  static GramMatrixBase<math_t>* create(KernelParams params, cublasHandle_t cublas_handle)
+  {
+    // KernelParams is not templated, so its double coefficients are converted to math_t here.
+    const math_t gamma = params.gamma;
+    const math_t coef0 = params.coef0;
+    switch (params.kernel) {
+      case LINEAR: return new GramMatrixBase<math_t>(cublas_handle);
+      case POLYNOMIAL:
+        return new PolynomialKernel<math_t, int>(params.degree, gamma, coef0, cublas_handle);
+      case TANH: return new TanhKernel<math_t>(gamma, coef0, cublas_handle);
+      case RBF: return new RBFKernel<math_t>(gamma);
+      default: throw raft::exception("Kernel not implemented");
+    }
+  }
+};
+
+};  // end namespace raft::distance::kernels::detail
diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh
new file mode 100644
index 0000000000..6d59e1c7c5
--- /dev/null
+++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "gram_matrix.cuh"
+#include <raft/util/cuda_utils.cuh>
+
+#include <raft/distance/distance.cuh>
+#include <raft/linalg/gemm.cuh>
+
+namespace raft::distance::kernels::detail {
+
+/** Epilogue function for polynomial kernel without padding.
+ * Calculates output = (gain*in + offset)^exponent
+ * @param inout device vector in column major format, size [len]
+ * @param len array length (64-bit, so the index arithmetic below is done in size_t)
+ * @param exponent power applied to each element
+ * @param gain multiplier applied to each element before the offset is added
+ * @param offset constant added before exponentiation
+ */
+template <typename math_t, typename exp_t>
+__global__ void polynomial_kernel_nopad(
+  math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset)
+{
+  for (size_t tid = threadIdx.x + static_cast<size_t>(blockIdx.x) * blockDim.x; tid < len;
+       tid += static_cast<size_t>(blockDim.x) * gridDim.x) {
+    inout[tid] = pow(gain * inout[tid] + offset, exponent);
+  }
+}
+
+/** Epilogue function for polynomial kernel with padding.
+ * Calculates output = (gain*input + offset)^exponent
+ * @param inout device vector in column major format, size [ld * cols]
+ * @param ld leading dimension of the inout buffer
+ * @param rows number of rows (rows <= ld)
+ * @param cols number of columns
+ * @param exponent power applied to each element
+ * @param gain multiplier applied to each element before the offset is added
+ * @param offset constant added before exponentiation
+ */
+template <typename math_t, typename exp_t>
+__global__ void polynomial_kernel(
+  math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset)
+{
+  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
+       tidy += blockDim.y * gridDim.y)
+    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
+         tidx += blockDim.x * gridDim.x) {
+      inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent);
+    }
+}
+
+/** Epilogue function for tanh kernel without padding.
+ * Calculates output = tanh(gain*input + offset)
+ * @param inout device vector, size [len]
+ * @param len length of the input vector (64-bit, so the index arithmetic is done in size_t)
+ * @param gain multiplier applied to each element before the offset is added
+ * @param offset constant added before tanh is applied
+ */
+template <typename math_t>
+__global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset)
+{
+  for (size_t tid = threadIdx.x + static_cast<size_t>(blockIdx.x) * blockDim.x; tid < len;
+       tid += static_cast<size_t>(blockDim.x) * gridDim.x) {
+    inout[tid] = tanh(gain * inout[tid] + offset);
+  }
+}
+
+/** Epilogue function for tanh kernel with padding.
+ * Calculates output = tanh(gain*input + offset)
+ * @param inout device vector in column major format, size [ld * cols]
+ * @param ld leading dimension of the inout buffer
+ * @param rows number of rows (rows <= ld)
+ * @param cols number of columns
+ * @param gain multiplier applied to each element before the offset is added
+ * @param offset constant added before tanh is applied
+ */
+template <typename math_t>
+__global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset)
+{
+  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
+       tidy += blockDim.y * gridDim.y)
+    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
+         tidx += blockDim.x * gridDim.x) {
+      inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset);
+    }
+}
+
+/**
+ * Create a kernel matrix using polynomial kernel function.
+ */
+template <typename math_t, typename exp_t>
+class PolynomialKernel : public GramMatrixBase<math_t> {
+  exp_t exponent;
+  math_t gain;
+  math_t offset;
+
+  // Launch the polynomial epilogue on `stream`, transforming the Gram matrix `inout` in place.
+  void applyKernel(
+    math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
+  {
+    const int n_minor = is_row_major ? cols : rows;
+    const int n_major = is_row_major ? rows : cols;
+    const size_t len  = static_cast<size_t>(rows) * cols;  // widen before multiplying
+    if (ld == n_minor) {
+      polynomial_kernel_nopad<<<raft::ceildiv<size_t>(len, 128), 128, 0, stream>>>(
+        inout, len, exponent, gain, offset);
+    } else {
+      polynomial_kernel<<<dim3(raft::ceildiv(n_minor, 32), raft::ceildiv(n_major, 4), 1),
+                          dim3(32, 4, 1), 0, stream>>>(
+        inout, ld, n_minor, n_major, exponent, gain, offset);
+    }
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
+
+ public:
+  /**
+   * Constructs a polynomial kernel object.
+   * It evaluates the kernel matrix using the following formula:
+   * K_ij = (gain*<x1_i, x2_k> + offset)^exponent
+   *
+   * @tparam math_t floating point type
+   * @tparam exp_t type of exponent
+   * @param exponent
+   * @param gain
+   * @param offset
+   * @param cublas_handle
+   */
+  PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t cublas_handle)
+    : GramMatrixBase<math_t>(cublas_handle), exponent(exponent), gain(gain), offset(offset)
+  {
+  }
+
+  /** Evaluate kernel matrix using polynomial kernel.
+   *
+   * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and < , > denotes dot product.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number vectors in x1
+   * @param [in] n_cols number of features in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*cols]
+   * @param [in] n2 number vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1
+   * @param ld2 leading dimension of x2
+   * @param ld_out leading dimension of out
+   */
+  void evaluate(const math_t* x1,
+                int n1,
+                int n_cols,
+                const math_t* x2,
+                int n2,
+                math_t* out,
+                bool is_row_major,
+                cudaStream_t stream,
+                int ld1,
+                int ld2,
+                int ld_out)
+  {
+    GramMatrixBase<math_t>::linear(
+      x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+    applyKernel(out, ld_out, n1, n2, is_row_major, stream);
+  }
+};
+
+/**
+ * Create a kernel matrix using tanh kernel function.
+ */
+template <typename math_t>
+class TanhKernel : public GramMatrixBase<math_t> {
+  math_t gain, offset;
+
+  // Launch the tanh epilogue on `stream`, transforming the Gram matrix `inout` in place.
+  void applyKernel(
+    math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
+  {
+    const int n_minor = is_row_major ? cols : rows;
+    const int n_major = is_row_major ? rows : cols;
+    // Widen before multiplying: rows * cols evaluated in int overflows for large matrices.
+    const size_t len = static_cast<size_t>(rows) * cols;
+    if (ld == n_minor) {
+      tanh_kernel_nopad<<<raft::ceildiv<size_t>(len, 128), 128, 0, stream>>>(
+        inout, len, gain, offset);
+    } else {
+      tanh_kernel<<<dim3(raft::ceildiv(n_minor, 32), raft::ceildiv(n_major, 4), 1),
+                    dim3(32, 4, 1), 0, stream>>>(inout, ld, n_minor, n_major, gain, offset);
+    }
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
+
+ public:
+  /**
+   * Constructs a tanh kernel object.
+   * It evaluates the kernel matrix using the following formula:
+   * K_ij = tanh(gain*<x1_i, x2_k> + offset)
+   *
+   * @tparam math_t floating point type
+   * @param gain
+   * @param offset
+   * @param cublas_handle
+   */
+  TanhKernel(math_t gain, math_t offset, cublasHandle_t cublas_handle)
+    : GramMatrixBase<math_t>(cublas_handle), gain(gain), offset(offset)
+  {
+  }
+
+  /** Evaluate kernel matrix using tanh kernel.
+   *
+   * output_[i + k*n1] = tanh(gain*<x1_i, x2_k> + offset),
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and < , > denotes dot product.
+   *
+   * @param [in] x1 device array of vectors,
+   *  size [n1*n_cols]
+   * @param [in] n1 number vectors in x1
+   * @param [in] n_cols number of features in x1 and x2
+   * @param [in] x2 device array of vectors,
+   *   size [n2*n_cols]
+   * @param [in] n2 number vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1 (usually it is n1)
+   * @param ld2 leading dimension of x2 (usually it is n2)
+   * @param ld_out leading dimension of out (usually it is n1)
+   */
+  void evaluate(const math_t* x1,
+                int n1,
+                int n_cols,
+                const math_t* x2,
+                int n2,
+                math_t* out,
+                bool is_row_major,
+                cudaStream_t stream,
+                int ld1,
+                int ld2,
+                int ld_out)
+  {
+    GramMatrixBase<math_t>::linear(
+      x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+    applyKernel(out, ld_out, n1, n2, is_row_major, stream);
+  }
+};
+
+/**
+ * Create a kernel matrix using RBF kernel function.
+ */
+template <typename math_t>
+class RBFKernel : public GramMatrixBase<math_t> {
+  math_t gain;
+
+  // NOTE(review): not called by this class; rbf_kernel_nopad / rbf_kernel are not defined in
+  // this header or in gram_matrix.cuh -- confirm they exist elsewhere or drop this method.
+  void applyKernel(
+    math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
+  {
+    const int n_minor = is_row_major ? cols : rows;
+    const int n_major = is_row_major ? rows : cols;
+    const size_t len  = static_cast<size_t>(rows) * cols;  // widen before multiplying
+    if (ld == n_minor) {
+      rbf_kernel_nopad<<<raft::ceildiv<size_t>(len, 128), 128, 0, stream>>>(inout, len, gain);
+    } else {
+      rbf_kernel<<<dim3(raft::ceildiv(n_minor, 32), raft::ceildiv(n_major, 4), 1),
+                   dim3(32, 4, 1), 0, stream>>>(inout, ld, n_minor, n_major, gain);
+    }
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
+
+ public:
+  /**
+   * Constructs a RBF kernel object.
+   * It evaluates the kernel matrix using the following formula:
+   * K_ij = exp(-gain*|x1_i- x2_k|^2)
+   *
+   * @tparam math_t floating point type
+   * @param gain
+   */
+  RBFKernel(math_t gain) : GramMatrixBase<math_t>(NULL), gain(gain) {}
+
+  /** Evaluate kernel matrix using RBF kernel.
+   *
+   * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and | | euclidean distance.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number vectors in x1
+   * @param [in] n_cols number of features in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*n_cols]
+   * @param [in] n2 number vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1, currently only ld1 == n1 is supported
+   * @param ld2 leading dimension of x2, currently only ld2 == n2 is supported
+   * @param ld_out leading dimension of out, only ld_out == n1 is supported
+   */
+  void evaluate(const math_t* x1,
+                int n1,
+                int n_cols,
+                const math_t* x2,
+                int n2,
+                math_t* out,
+                bool is_row_major,
+                cudaStream_t stream,
+                int ld1,
+                int ld2,
+                int ld_out)
+  {
+    int minor1    = is_row_major ? n_cols : n1;
+    int minor2    = is_row_major ? n_cols : n2;
+    int minor_out = is_row_major ? n2 : n1;
+    ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter");
+    ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter");
+    ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter");
+    distance(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+  }
+
+  /** Customize distance function with the RBF epilogue */
+  void distance(const math_t* x1,
+                int n1,
+                int n_cols,
+                const math_t* x2,
+                int n2,
+                math_t* out,
+                bool is_row_major,
+                cudaStream_t stream,
+                int ld1,
+                int ld2,
+                int ld_out)
+  {
+    math_t gain   = this->gain;
+    using index_t = int64_t;
+
+    auto fin_op = [gain] __device__(math_t d_val, index_t idx) { return exp(-gain * d_val); };
+    raft::distance::distance<raft::distance::DistanceType::L2Unexpanded,
+                             math_t,
+                             math_t,
+                             math_t,
+                             decltype(fin_op),
+                             index_t>(const_cast<math_t*>(x1),
+                                      const_cast<math_t*>(x2),
+                                      out,
+                                      n1,
+                                      n2,
+                                      n_cols,
+                                      NULL,
+                                      0,
+                                      fin_op,
+                                      stream,
+                                      is_row_major);
+  }
+};
+
+};  // end namespace raft::distance::kernels::detail
diff --git a/cpp/include/raft/distance/distance_types.hpp b/cpp/include/raft/distance/distance_types.hpp
index f75263b00d..f5ed68af4a 100644
--- a/cpp/include/raft/distance/distance_types.hpp
+++ b/cpp/include/raft/distance/distance_types.hpp
@@ -65,5 +65,26 @@ enum DistanceType : unsigned short {
   /** Precomputed (special value) **/
   Precomputed = 100
 };
+
+namespace kernels {
+enum KernelType { LINEAR, POLYNOMIAL, RBF, TANH };
+
+/**
+ * Parameters for kernel matrices.
+ * The following kernels are implemented:
+ * - LINEAR \f[ K(x_1,x_2) = <x_1,x_2>, \f] where \f$< , >\f$ is the dot product
+ * - POLYNOMIAL \f[ K(x_1, x_2) = (\gamma <x_1,x_2> + \mathrm{coef0})^\mathrm{degree} \f]
+ * - RBF \f[ K(x_1, x_2) = \exp(- \gamma |x_1-x_2|^2) \f]
+ * - TANH \f[ K(x_1, x_2) = \tanh(\gamma <x_1,x_2> + \mathrm{coef0}) \f]
+ */
+struct KernelParams {
+  // Kernel function parameters
+  KernelType kernel;  //!< Type of the kernel function
+  int degree;         //!< Degree of polynomial kernel (ignored by others)
+  double gamma;       //!< multiplier in the polynomial, RBF and tanh kernels
+  double coef0;       //!< additive constant in poly and tanh kernels
+};
+}  // end namespace kernels
+
 };  // namespace distance
 };  // end namespace raft
diff --git a/cpp/include/raft/distance/kernels.cuh b/cpp/include/raft/distance/kernels.cuh
new file mode 100644
index 0000000000..86f9f82406
--- /dev/null
+++ b/cpp/include/raft/distance/kernels.cuh
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/distance/detail/kernels/gram_matrix.cuh>
+#include <raft/distance/detail/kernels/kernel_factory.cuh>
+#include <raft/util/cuda_utils.cuh>
+
+#include <raft/distance/distance.cuh>
+#include <raft/linalg/gemm.cuh>
+
+namespace raft::distance::kernels {
+
+// TODO: Need to expose formal APIs for this that are more consistent w/ other APIs in RAFT
+using raft::distance::kernels::detail::GramMatrixBase;
+using raft::distance::kernels::detail::KernelFactory;
+
+};  // end namespace raft::distance::kernels
diff --git a/cpp/include/raft/distance/specializations/detail/kernels.cuh b/cpp/include/raft/distance/specializations/detail/kernels.cuh
new file mode 100644
index 0000000000..75c9c023e8
--- /dev/null
+++ b/cpp/include/raft/distance/specializations/detail/kernels.cuh
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/detail/kernels/gram_matrix.cuh>
+#include <raft/distance/detail/kernels/kernel_matrices.cuh>
+
+extern template class raft::distance::kernels::detail::GramMatrixBase<double>;
+extern template class raft::distance::kernels::detail::GramMatrixBase<float>;
+
+extern template class raft::distance::kernels::detail::PolynomialKernel<double, int>;
+extern template class raft::distance::kernels::detail::PolynomialKernel<float, int>;
+
+extern template class raft::distance::kernels::detail::TanhKernel<double>;
+extern template class raft::distance::kernels::detail::TanhKernel<float>;
+
+// These are somehow missing a kernel definition which is causing a compile error
+// extern template class raft::distance::kernels::detail::RBFKernel<double>;
+// extern template class raft::distance::kernels::detail::RBFKernel<float>;
\ No newline at end of file
diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh
index 73d075f260..053441d68a 100644
--- a/cpp/include/raft/distance/specializations/distance.cuh
+++ b/cpp/include/raft/distance/specializations/distance.cuh
@@ -23,6 +23,7 @@
 #include <raft/distance/specializations/detail/hamming_unexpanded.cuh>
 #include <raft/distance/specializations/detail/hellinger_expanded.cuh>
 #include <raft/distance/specializations/detail/jensen_shannon.cuh>
+#include <raft/distance/specializations/detail/kernels.cuh>
 #include <raft/distance/specializations/detail/kl_divergence.cuh>
 #include <raft/distance/specializations/detail/l1.cuh>
 #include <raft/distance/specializations/detail/l2_expanded.cuh>
diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
index 9ddcbae20b..dc92271141 100644
--- a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
@@ -92,48 +92,48 @@ struct quadSum {
 
 #define SUM_ROWS_SMALL_K_DIMX         256
 #define SUM_ROWS_BY_KEY_SMALL_K_MAX_K 4
-template <typename DataIteratorT, typename WeightT>
+template <typename DataIteratorT, typename WeightT, typename SumsT, typename IdxT>
 __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4)
 
-  __global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT* d_A,
-                                                     int lda,
+  __global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A,
+                                                     IdxT lda,
                                                      const char* d_keys,
                                                      const WeightT* d_weights,
-                                                     int nrows,
-                                                     int ncols,
-                                                     int nkeys,
-                                                     DataIteratorT* d_sums)
+                                                     IdxT nrows,
+                                                     IdxT ncols,
+                                                     IdxT nkeys,
+                                                     SumsT* d_sums)
 {
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
-  typedef cub::BlockReduce<quad<DataType>, SUM_ROWS_SMALL_K_DIMX> BlockReduce;
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+  typedef cub::BlockReduce<quad<SumsT>, SUM_ROWS_SMALL_K_DIMX> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
-  for (int idim = static_cast<int>(blockIdx.y); idim < ncols; idim += gridDim.y) {
-    if (idim != static_cast<int>(blockIdx.y)) __syncthreads();  // we're reusing temp_storage
+  for (IdxT idim = static_cast<IdxT>(blockIdx.y); idim < ncols; idim += gridDim.y) {
+    if (idim != static_cast<IdxT>(blockIdx.y)) __syncthreads();  // we're reusing temp_storage
 
     // threadIdx.x stores partial sum for current dim and key=threadIdx.x in this reg
-    quad<DataType> thread_sums;
+    quad<SumsT> thread_sums;
     thread_sums.x = 0.0;
     thread_sums.y = 0.0;
     thread_sums.z = 0.0;
     thread_sums.w = 0.0;
 
     // May use vectorized load - not necessary for doubles
-    for (int block_offset_irow = blockIdx.x * blockDim.x;
+    for (IdxT block_offset_irow = blockIdx.x * blockDim.x;
          block_offset_irow < nrows;  // we will syncthreads() inside the loop, no CTA divergence
          block_offset_irow += blockDim.x * gridDim.x) {
-      int irow     = block_offset_irow + threadIdx.x;
+      IdxT irow    = block_offset_irow + threadIdx.x;
       DataType val = (irow < nrows) ? d_A[irow * lda + idim] : 0.0;
       if (d_weights && irow < nrows) { val = val * d_weights[irow]; }
       // we are not reusing the keys - after profiling
       // d_keys is mainly loaded from L2, and this kernel is DRAM BW bounded
       // (experimentation gave a 10% speed up - not worth the many code lines added)
-      int row_key = (irow < nrows) ? d_keys[irow] : -1;
+      IdxT row_key = (irow < nrows) ? d_keys[irow] : std::numeric_limits<IdxT>::max();
 
-      thread_sums.x += (row_key == 0) ? val : 0.0;
-      thread_sums.y += (row_key == 1) ? val : 0.0;
-      thread_sums.z += (row_key == 2) ? val : 0.0;
-      thread_sums.w += (row_key == 3) ? val : 0.0;
+      thread_sums.x += (row_key == 0) ? static_cast<SumsT>(val) : 0.0;
+      thread_sums.y += (row_key == 1) ? static_cast<SumsT>(val) : 0.0;
+      thread_sums.z += (row_key == 2) ? static_cast<SumsT>(val) : 0.0;
+      thread_sums.w += (row_key == 3) ? static_cast<SumsT>(val) : 0.0;
     }
 
     // End of column
@@ -142,12 +142,12 @@ __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4)
     // Strided access
 
     // Reducing by key
-    thread_sums = BlockReduce(temp_storage).Reduce(thread_sums, quadSum<DataType>());
+    thread_sums = BlockReduce(temp_storage).Reduce(thread_sums, quadSum<SumsT>());
 
     if (threadIdx.x < 32) {
       // We only need 4
       thread_sums = cub::ShuffleIndex<32>(thread_sums, 0, 0xffffffff);
-      if (static_cast<int>(threadIdx.x) < nkeys) {
+      if (static_cast<IdxT>(threadIdx.x) < nkeys) {
         if (threadIdx.x == 0) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.x);
         if (threadIdx.x == 1) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.y);
         if (threadIdx.x == 2) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.z);
@@ -157,22 +157,22 @@ __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4)
   }
 }
 
-template <typename DataIteratorT, typename WeightT>
-void sum_rows_by_key_small_nkeys(const DataIteratorT* d_A,
-                                 int lda,
+template <typename DataIteratorT, typename WeightT, typename SumsT, typename IdxT>
+void sum_rows_by_key_small_nkeys(const DataIteratorT d_A,
+                                 IdxT lda,
                                  const char* d_keys,
                                  const WeightT* d_weights,
-                                 int nrows,
-                                 int ncols,
-                                 int nkeys,
-                                 DataIteratorT* d_sums,
+                                 IdxT nrows,
+                                 IdxT ncols,
+                                 IdxT nkeys,
+                                 SumsT* d_sums,
                                  cudaStream_t st)
 {
   dim3 grid, block;
   block.x = SUM_ROWS_SMALL_K_DIMX;
   block.y = 1;  // Necessary
 
-  grid.x = raft::ceildiv(nrows, (int)block.x);
+  grid.x = raft::ceildiv(nrows, (IdxT)block.x);
   grid.x = std::min(grid.x, 32u);
   grid.y = ncols;
   grid.y = std::min(grid.y, MAX_BLOCKS);
@@ -188,45 +188,49 @@ void sum_rows_by_key_small_nkeys(const DataIteratorT* d_A,
 
 #define SUM_ROWS_BY_KEY_LARGE_K_MAX_K 1024
 
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT* d_A,
-                                                            int lda,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A,
+                                                            IdxT lda,
                                                             KeysIteratorT d_keys,
                                                             const WeightT* d_weights,
-                                                            int nrows,
-                                                            int ncols,
+                                                            IdxT nrows,
+                                                            IdxT ncols,
                                                             int key_offset,
-                                                            int nkeys,
-                                                            DataIteratorT* d_sums)
+                                                            IdxT nkeys,
+                                                            SumsT* d_sums)
 {
   typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
-  __shared__ DataType local_sums[SUM_ROWS_BY_KEY_LARGE_K_MAX_K];
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+  __shared__ SumsT local_sums[SUM_ROWS_BY_KEY_LARGE_K_MAX_K];
 
-  for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x)
+  for (IdxT local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x)
     local_sums[local_key] = 0.0;
 
-  for (int idim = blockIdx.y; idim < ncols; idim += gridDim.y) {
+  for (IdxT idim = blockIdx.y; idim < ncols; idim += gridDim.y) {
     __syncthreads();  // local_sums
 
     // At this point local_sums if full of zeros
 
-    for (int irow = blockIdx.x * blockDim.x + threadIdx.x; irow < nrows;
+    for (IdxT irow = blockIdx.x * blockDim.x + threadIdx.x; irow < nrows;
          irow += blockDim.x * gridDim.x) {
       // Branch div in this loop - not an issue with current code
       DataType val = d_A[idim * lda + irow];
       if (d_weights) val = val * d_weights[irow];
 
-      int local_key = d_keys[irow] - key_offset;
+      IdxT local_key = d_keys[irow] - key_offset;
 
       // We could load next val here
-      raft::myAtomicAdd(&local_sums[local_key], val);
+      raft::myAtomicAdd(&local_sums[local_key], static_cast<SumsT>(val));
     }
 
     __syncthreads();  // local_sums
 
-    for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) {
-      DataType local_sum = local_sums[local_key];
+    for (IdxT local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) {
+      SumsT local_sum = local_sums[local_key];
 
       if (local_sum != 0.0) {
         KeyType global_key = key_offset + local_key;
@@ -237,22 +241,22 @@ __global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT*
   }
 }
 
-template <typename DataIteratorT, typename KeysIteratorT>
-void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT* d_A,
-                                          int lda,
+template <typename DataIteratorT, typename KeysIteratorT, typename SumsT, typename IdxT>
+void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT d_A,
+                                          IdxT lda,
                                           KeysIteratorT d_keys,
-                                          int nrows,
-                                          int ncols,
+                                          IdxT nrows,
+                                          IdxT ncols,
                                           int key_offset,
-                                          int nkeys,
-                                          DataIteratorT* d_sums,
+                                          IdxT nkeys,
+                                          SumsT* d_sums,
                                           cudaStream_t st)
 {
   dim3 grid, block;
   block.x = SUM_ROWS_SMALL_K_DIMX;
   block.y = 1;  // Necessary
 
-  grid.x = raft::ceildiv(nrows, (int)block.x);
+  grid.x = raft::ceildiv(nrows, (IdxT)block.x);
   grid.x = std::min(grid.x, 32u);
   grid.y = ncols;
   grid.y = std::min(grid.y, MAX_BLOCKS);
@@ -260,91 +264,47 @@ void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT* d_A,
     d_A, lda, d_keys, nrows, ncols, key_offset, nkeys, d_sums);
 }
 
-#define RRBK_SHMEM_SZ 32
-
-//#define RRBK_SHMEM
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT* d_A,
-                                                            int lda,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A,
+                                                            IdxT lda,
                                                             const WeightT* d_weights,
                                                             KeysIteratorT d_keys,
-                                                            int nrows,
-                                                            int ncols,
-                                                            int key_offset,
-                                                            int nkeys,
-                                                            DataIteratorT* d_sums)
+                                                            IdxT nrows,
+                                                            IdxT ncols,
+                                                            SumsT* d_sums)
 {
-  typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
-
-#ifdef RRBK_SHMEM
-  __shared__ KeyType sh_keys[RRBK_SHMEM_SZ];
-#endif
-  int rows_per_partition = nrows / gridDim.z + 1;
-  int start_row          = blockIdx.z * rows_per_partition;
-  int end_row            = start_row + rows_per_partition;
-  end_row                = end_row > nrows ? nrows : end_row;
-
-  KeyType local_key = blockIdx.y;
-  if (local_key >= nkeys) return;
-  int this_col = threadIdx.x + blockIdx.x * blockDim.x;
-  if (this_col >= ncols) return;
-
-  DataType sum       = 0.0;
-  KeyType global_key = key_offset + local_key;
-#ifdef RRBK_SHMEM
-  int sh_key_inx = 0;
-#endif
-  for (int r = start_row; r < end_row; r++) {
-#ifdef RRBK_SHMEM
-    if (0 == sh_key_inx % RRBK_SHMEM_SZ) {
-      for (int x = threadIdx.x; x < RRBK_SHMEM_SZ; x += blockDim.x)
-        sh_keys[x] = d_keys[r + x];
-      __syncthreads();
-    }
-    if (sh_keys[sh_key_inx] != global_key) continue;  // No divergence since global_key is the
-    // same for the whole block
-    sh_key_inx++;
-#else
-    if (d_keys[r] != global_key)
-      continue;  // No divergence since global_key is the
-                 // same for the whole block
-#endif
-    // if ((end_row-start_row) / (r-start_row) != global_key) continue;
-    DataType val = __ldcg(&d_A[r * lda + this_col]);
-    if (d_weights) { val = val * d_weights[r]; }
-    sum += val;
-  }
-
-  if (sum != 0.0) raft::myAtomicAdd(&d_sums[global_key * ncols + this_col], sum);
+  IdxT gid = threadIdx.x + (blockDim.x * static_cast<IdxT>(blockIdx.x));
+  IdxT j   = gid % ncols;
+  IdxT i   = gid / ncols;
+  if (i >= nrows) return;
+  IdxT l    = static_cast<IdxT>(d_keys[i]);
+  SumsT val = d_A[j + lda * i];
+  if (d_weights != nullptr) val *= d_weights[i];
+  raft::myAtomicAdd(&d_sums[j + ncols * l], val);
 }
 
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT* d_A,
-                                          int lda,
-                                          KeysIteratorT d_keys,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT d_A,
+                                          IdxT lda,
+                                          const KeysIteratorT d_keys,
                                           const WeightT* d_weights,
-                                          int nrows,
-                                          int ncols,
-                                          int key_offset,
-                                          int nkeys,
-                                          DataIteratorT* d_sums,
+                                          IdxT nrows,
+                                          IdxT ncols,
+                                          SumsT* d_sums,
                                           cudaStream_t st)
 {
-  // x-dim refers to the column in the input data
-  // y-dim refers to the key
-  // z-dim refers to a partitioning of the rows among the threadblocks
-  dim3 grid, block;
-  block.x = 256;  // Adjust me!
-  block.y = 1;    // Don't adjust me!
-  grid.x  = raft::ceildiv(ncols, (int)block.x);
-  grid.y  = nkeys;
-  grid.z  = std::max(40960000 / nkeys / ncols, (int)1);  // Adjust me!
-  grid.z  = std::min(grid.z, (unsigned int)nrows);
-  grid.z  = std::min(grid.z, MAX_BLOCKS);
-
-  sum_rows_by_key_large_nkeys_kernel_rowmajor<<<grid, block, 0, st>>>(
-    d_A, lda, d_weights, d_keys, nrows, ncols, key_offset, nkeys, d_sums);
+  uint32_t block_dim = 128;
+  auto grid_dim      = static_cast<uint32_t>(ceildiv<IdxT>(nrows * ncols, (IdxT)block_dim));
+  sum_rows_by_key_large_nkeys_kernel_rowmajor<<<grid_dim, block_dim, 0, st>>>(
+    d_A, lda, d_weights, d_keys, nrows, ncols, d_sums);
 }
 
 /**
@@ -354,6 +314,8 @@ void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT* d_A,
  *                       (may be a simple pointer type)
  * @tparam KeysIteratorT Random-access iterator type, for reading input keys
  *                       (may be a simple pointer type)
+ * @tparam SumsT         Type of the output sums
+ * @tparam IdxT          Index type
  *
  * @param[in]  d_A         Input data array (lda x nrows)
  * @param[in]  lda         Real row size for input data, d_A
@@ -365,26 +327,31 @@ void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT* d_A,
  * @param[in]  nkeys       Number of unique keys in d_keys
  * @param[out] d_sums      Row sums by key (ncols x d_keys)
  * @param[in]  stream      CUDA stream
+ * @param[in]  reset_sums  Whether to reset the output sums to zero before reducing
  */
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-void reduce_rows_by_key(const DataIteratorT* d_A,
-                        int lda,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+void reduce_rows_by_key(const DataIteratorT d_A,
+                        IdxT lda,
                         KeysIteratorT d_keys,
                         const WeightT* d_weights,
                         char* d_keys_char,
-                        int nrows,
-                        int ncols,
-                        int nkeys,
-                        DataIteratorT* d_sums,
-                        cudaStream_t stream)
+                        IdxT nrows,
+                        IdxT ncols,
+                        IdxT nkeys,
+                        SumsT* d_sums,
+                        cudaStream_t stream,
+                        bool reset_sums)
 {
   typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
 
   // Following kernel needs memset
-  cudaMemsetAsync(d_sums, 0, ncols * nkeys * sizeof(DataType), stream);
+  if (reset_sums) { cudaMemsetAsync(d_sums, 0, ncols * nkeys * sizeof(SumsT), stream); }
 
-  if (nkeys <= SUM_ROWS_BY_KEY_SMALL_K_MAX_K) {
+  if (d_keys_char != nullptr && nkeys <= SUM_ROWS_BY_KEY_SMALL_K_MAX_K) {
     // sum_rows_by_key_small_k is BW bounded. d_keys is loaded ncols time - avoiding wasting BW
     // with doubles we have ~20% speed up - with floats we can hope something around 2x
     // Converting d_keys to char
@@ -392,12 +359,7 @@ void reduce_rows_by_key(const DataIteratorT* d_A,
     sum_rows_by_key_small_nkeys(
       d_A, lda, d_keys_char, d_weights, nrows, ncols, nkeys, d_sums, stream);
   } else {
-    for (KeyType key_offset = 0; key_offset < static_cast<KeyType>(nkeys);
-         key_offset += SUM_ROWS_BY_KEY_LARGE_K_MAX_K) {
-      KeyType this_call_nkeys = std::min(SUM_ROWS_BY_KEY_LARGE_K_MAX_K, nkeys);
-      sum_rows_by_key_large_nkeys_rowmajor(
-        d_A, lda, d_keys, d_weights, nrows, ncols, key_offset, this_call_nkeys, d_sums, stream);
-    }
+    sum_rows_by_key_large_nkeys_rowmajor(d_A, lda, d_keys, d_weights, nrows, ncols, d_sums, stream);
   }
 }
 
@@ -407,6 +369,8 @@ void reduce_rows_by_key(const DataIteratorT* d_A,
  * pointer type)
  * @tparam KeysIteratorT Random-access iterator type, for reading input keys (may be a simple
  * pointer type)
+ * @tparam SumsT         Type of the output sums
+ * @tparam IdxT          Index type
  * @param[in]  d_A         Input data array (lda x nrows)
  * @param[in]  lda         Real row size for input data, d_A
  * @param[in]  d_keys      Keys for each row (1 x nrows)
@@ -417,18 +381,19 @@ void reduce_rows_by_key(const DataIteratorT* d_A,
  * @param[out] d_sums      Row sums by key (ncols x d_keys)
  * @param[in]  stream      CUDA stream
  */
-template <typename DataIteratorT, typename KeysIteratorT>
-void reduce_rows_by_key(const DataIteratorT* d_A,
-                        int lda,
+template <typename DataIteratorT, typename KeysIteratorT, typename SumsT, typename IdxT>
+void reduce_rows_by_key(const DataIteratorT d_A,
+                        IdxT lda,
                         KeysIteratorT d_keys,
                         char* d_keys_char,
-                        int nrows,
-                        int ncols,
-                        int nkeys,
-                        DataIteratorT* d_sums,
-                        cudaStream_t stream)
+                        IdxT nrows,
+                        IdxT ncols,
+                        IdxT nkeys,
+                        SumsT* d_sums,
+                        cudaStream_t stream,
+                        bool reset_sums)
 {
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
   reduce_rows_by_key(d_A,
                      lda,
                      d_keys,
@@ -438,7 +403,8 @@ void reduce_rows_by_key(const DataIteratorT* d_A,
                      ncols,
                      nkeys,
                      d_sums,
-                     stream);
+                     stream,
+                     reset_sums);
 }
 
 };  // end namespace detail
diff --git a/cpp/include/raft/linalg/init.cuh b/cpp/include/raft/linalg/init.cuh
index 2fdf9dceb9..5a810bf2ba 100644
--- a/cpp/include/raft/linalg/init.cuh
+++ b/cpp/include/raft/linalg/init.cuh
@@ -19,6 +19,7 @@
 #pragma once
 
 #include "detail/init.hpp"
+#include <raft/util/cudart_utils.hpp>
 
 namespace raft {
 namespace linalg {
@@ -54,6 +55,19 @@ void range(T* out, int n, cudaStream_t stream)
   detail::range(out, n, stream);
 }
 
+/**
+ * @brief Zeros the output.
+ *
+ * \param [out] out device array, size [n]
+ * \param [in] n length of the array
+ * \param [in] stream cuda stream
+ */
+template <typename T>
+void zero(T* out, int n, cudaStream_t stream)
+{
+  RAFT_CUDA_TRY(cudaMemsetAsync(static_cast<void*>(out), 0, n * sizeof(T), stream));
+}
+
 }  // namespace linalg
 }  // namespace raft
 
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
index 39c54e8b0c..1dabd92087 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -43,6 +43,8 @@ void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st)
  *                       (may be a simple pointer type)
  * @tparam KeysIteratorT Random-access iterator type, for reading input keys
  *                       (may be a simple pointer type)
+ * @tparam SumsT         Type of the output sums
+ * @tparam IdxT          Index type
  *
  * @param[in]  d_A         Input data array (lda x nrows)
  * @param[in]  lda         Real row size for input data, d_A
@@ -54,21 +56,27 @@ void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st)
  * @param[in]  nkeys       Number of unique keys in d_keys
  * @param[out] d_sums      Row sums by key (ncols x d_keys)
  * @param[in]  stream      CUDA stream
+ * @param[in]  reset_sums  Whether to reset the output sums to zero before reducing
  */
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-void reduce_rows_by_key(const DataIteratorT* d_A,
-                        int lda,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+void reduce_rows_by_key(const DataIteratorT d_A,
+                        IdxT lda,
                         const KeysIteratorT d_keys,
                         const WeightT* d_weights,
                         char* d_keys_char,
-                        int nrows,
-                        int ncols,
-                        int nkeys,
-                        DataIteratorT* d_sums,
-                        cudaStream_t stream)
+                        IdxT nrows,
+                        IdxT ncols,
+                        IdxT nkeys,
+                        SumsT* d_sums,
+                        cudaStream_t stream,
+                        bool reset_sums = true)
 {
   detail::reduce_rows_by_key(
-    d_A, lda, d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream);
+    d_A, lda, d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream, reset_sums);
 }
 
 /**
@@ -77,6 +85,8 @@ void reduce_rows_by_key(const DataIteratorT* d_A,
  * pointer type)
  * @tparam KeysIteratorT Random-access iterator type, for reading input keys (may be a simple
  * pointer type)
+ * @tparam SumsT         Type of the output sums
+ * @tparam IdxT          Index type
  * @param[in]  d_A         Input data array (lda x nrows)
  * @param[in]  lda         Real row size for input data, d_A
  * @param[in]  d_keys      Keys for each row (1 x nrows)
@@ -86,19 +96,21 @@ void reduce_rows_by_key(const DataIteratorT* d_A,
  * @param[in]  nkeys       Number of unique keys in d_keys
  * @param[out] d_sums      Row sums by key (ncols x d_keys)
  * @param[in]  stream      CUDA stream
+ * @param[in]  reset_sums  Whether to reset the output sums to zero before reducing
  */
-template <typename DataIteratorT, typename KeysIteratorT>
-void reduce_rows_by_key(const DataIteratorT* d_A,
-                        int lda,
-                        KeysIteratorT d_keys,
+template <typename DataIteratorT, typename KeysIteratorT, typename SumsT, typename IdxT>
+void reduce_rows_by_key(const DataIteratorT d_A,
+                        IdxT lda,
+                        const KeysIteratorT d_keys,
                         char* d_keys_char,
-                        int nrows,
-                        int ncols,
-                        int nkeys,
-                        DataIteratorT* d_sums,
-                        cudaStream_t stream)
+                        IdxT nrows,
+                        IdxT ncols,
+                        IdxT nkeys,
+                        SumsT* d_sums,
+                        cudaStream_t stream,
+                        bool reset_sums = true)
 {
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
   reduce_rows_by_key(d_A,
                      lda,
                      d_keys,
@@ -108,7 +120,8 @@ void reduce_rows_by_key(const DataIteratorT* d_A,
                      ncols,
                      nkeys,
                      d_sums,
-                     stream);
+                     stream,
+                     reset_sums);
 }
 
 /**
@@ -128,9 +141,10 @@ void reduce_rows_by_key(const DataIteratorT* d_A,
  * @param[in]  d_keys      Keys for each row raft::device_vector_view (1 x nrows)
  * @param[out] d_sums      Row sums by key raft::device_matrix_view (ncols x d_keys)
  * @param[in]  n_unique_keys       Number of unique keys in d_keys
+ * @param[out] d_keys_char Scratch memory for conversion of keys to char, raft::device_vector_view
  * @param[in]  d_weights   Weights for each observation in d_A raft::device_vector_view optional (1
  * x nrows)
- * @param[out] d_keys_char Scratch memory for conversion of keys to char, raft::device_vector_view
+ * @param[in]  reset_sums  Whether to reset the output sums to zero before reducing
  */
 template <typename ElementType, typename KeyType, typename WeightType, typename IndexType>
 void reduce_rows_by_key(
@@ -140,7 +154,8 @@ void reduce_rows_by_key(
   raft::device_matrix_view<ElementType, IndexType, raft::row_major> d_sums,
   IndexType n_unique_keys,
   raft::device_vector_view<char, IndexType> d_keys_char,
-  std::optional<raft::device_vector_view<const WeightType, IndexType>> d_weights = std::nullopt)
+  std::optional<raft::device_vector_view<const WeightType, IndexType>> d_weights = std::nullopt,
+  bool reset_sums                                                                = true)
 {
   RAFT_EXPECTS(d_A.extent(0) == d_A.extent(0) && d_sums.extent(1) == n_unique_keys,
                "Output is not of size ncols * n_unique_keys");
@@ -158,7 +173,8 @@ void reduce_rows_by_key(
                        d_A.extent(0),
                        n_unique_keys,
                        d_sums.data_handle(),
-                       handle.get_stream());
+                       handle.get_stream(),
+                       reset_sums);
   } else {
     reduce_rows_by_key(d_A.data_handle(),
                        d_A.extent(0),
@@ -168,7 +184,8 @@ void reduce_rows_by_key(
                        d_A.extent(0),
                        n_unique_keys,
                        d_sums.data_handle(),
-                       handle.get_stream());
+                       handle.get_stream(),
+                       reset_sums);
   }
 }
 
diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh
index 15f5204382..8180b88c8a 100644
--- a/cpp/include/raft/matrix/detail/linewise_op.cuh
+++ b/cpp/include/raft/matrix/detail/linewise_op.cuh
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/mdspan.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/pow2_utils.cuh>
 #include <raft/util/vectorized.cuh>
@@ -176,6 +178,37 @@ struct Linewise {
       return out;
     }
   }
+
+  /**
+   * @brief Same as loadVec, but pads the data with ones
+   *
+   * @param shm
+   * @param p
+   * @param blockOffset
+   * @param rowLen
+   * @param rowLenPadded
+   * @return a contiguous chunk of a vector, suitable for `vectorRows`.
+   */
+  static __device__ __forceinline__ Vec loadVecPadded(Type* shm,
+                                                      const Type* p,
+                                                      const IdxType blockOffset,
+                                                      const IdxType rowLen,
+                                                      const IdxType rowLenPadded) noexcept
+  {
+    IdxType j = blockOffset + threadIdx.x;
+#pragma unroll VecElems
+    for (int k = threadIdx.x; k < VecElems * BlockSize; k += BlockSize, j += BlockSize) {
+      while (j >= rowLenPadded)
+        j -= rowLenPadded;
+      shm[k] = j < rowLen ? p[j] : Type(1);
+    }
+    __syncthreads();
+    {
+      Vec out;
+      *out.vectorized_data() = reinterpret_cast<typename Vec::io_t*>(shm)[threadIdx.x];
+      return out;
+    }
+  }
 };
 
 /**
@@ -325,6 +358,48 @@ __global__ void __launch_bounds__(BlockSize)
     (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, blockOffset, rowLen))...);
 }
 
+/**
+ * Simplified version of `matrixLinewiseVecRowsMainKernel` for use with padded data.
+ * Data is required to be aligned and padded.
+ *
+ * @param [out] out the start of the *aligned* part of the output matrix
+ * @param [in] in the start of the *aligned* part of the input matrix
+ * @param [in] rowLen number of elements in a row (= the vector size)
+ * @param [in] rowLenPadded number of elements in a row including the padding
+ * @param [in] lenPadded the total length of the padded matrices (rowLenPadded * number of rows)
+ * @param [in] op the function to apply
+ * @param [in] vecs pointers to the argument vectors
+ */
+template <typename Type,
+          typename IdxType,
+          std::size_t VecBytes,
+          int BlockSize,
+          typename Lambda,
+          typename... Vecs>
+__global__ void __launch_bounds__(BlockSize)
+  matrixLinewiseVecRowsSpanKernel(Type* out,
+                                  const Type* in,
+                                  const IdxType rowLen,
+                                  const IdxType rowLenPadded,
+                                  const IdxType lenPadded,
+                                  Lambda op,
+                                  Vecs... vecs)
+{
+  typedef Linewise<Type, IdxType, VecBytes, BlockSize> L;
+  constexpr uint workSize = L::VecElems * BlockSize;
+  uint workOffset         = workSize;
+  __shared__ __align__(sizeof(Type) * L::VecElems)
+    Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)];
+  const IdxType blockOffset = (BlockSize * L::VecElems * blockIdx.x) % rowLenPadded;
+  return L::vectorRows(
+    reinterpret_cast<typename L::Vec::io_t*>(out),
+    reinterpret_cast<const typename L::Vec::io_t*>(in),
+    L::AlignElems::div(lenPadded),
+    op,
+    (workOffset ^= workSize,
+     L::loadVecPadded(shm + workOffset, vecs, blockOffset, rowLen, rowLenPadded))...);
+}
+
 /**
  * This kernel is similar to `matrixLinewiseVecRowsMainKernel`, but processes only the unaligned
  * head and tail parts of the matrix.
@@ -444,6 +519,59 @@ void matrixLinewiseVecCols(Type* out,
   }
 }
 
+/**
+ *  input/output data is expected to be aligned and padded
+ *  we simply extend the operation over the padded elements to be fully aligned
+ */
+template <typename Type,
+          typename IdxType,
+          typename LayoutPolicy,
+          std::size_t VecBytes,
+          int BlockSize,
+          typename Lambda,
+          typename... Vecs>
+void matrixLinewiseVecColsSpan(
+  raft::device_aligned_matrix_view<Type, IdxType, LayoutPolicy> out,
+  raft::device_aligned_matrix_view<const Type, IdxType, LayoutPolicy> in,
+  const IdxType rowLen,
+  const IdxType nRows,
+  Lambda op,
+  cudaStream_t stream,
+  Vecs... vecs)
+{
+  typedef raft::Pow2<VecBytes> AlignBytes;
+  constexpr std::size_t VecElems = VecBytes / sizeof(Type);
+
+  typedef raft::Pow2<raft::layout_left_padded<Type>::padding> AlignPadding;
+
+  const uint paddedRowLen  = AlignPadding::roundUp(rowLen);
+  const IdxType alignedLen = paddedRowLen * nRows;
+
+  if (rowLen * nRows > 0) {
+    constexpr dim3 bs(BlockSize, 1, 1);
+    // Minimum size of the grid to make the device well occupied
+    const uint occupy = getOptimalGridSize<BlockSize>();
+    // does not make sense to have more blocks than this
+    const uint maxBlocks = raft::ceildiv<uint>(uint(alignedLen), bs.x * VecElems);
+    const dim3 gs(std::min(maxBlocks, occupy), 1, 1);
+    // The work arrangement is blocked on the block and warp levels;
+    //   see more details at Linewise::vectorCols.
+    // The value below determines how many scalar elements are processed by one thread in total.
+    const IdxType elemsPerThread =
+      raft::ceildiv<IdxType>(alignedLen, gs.x * VecElems * BlockSize) * VecElems;
+    matrixLinewiseVecColsMainKernel<Type, IdxType, VecBytes, BlockSize, Lambda, Vecs...>
+      <<<gs, bs, 0, stream>>>(out.data_handle(),
+                              in.data_handle(),
+                              0,
+                              paddedRowLen,
+                              alignedLen,
+                              elemsPerThread,
+                              op,
+                              vecs...);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
+}
+
 template <typename Type,
           typename IdxType,
           std::size_t VecBytes,
@@ -508,6 +636,70 @@ void matrixLinewiseVecRows(Type* out,
   }
 }
 
+/**
+ *  input/output data is expected to be aligned and padded
+ *  we simply extend the operation over the padded elements to be fully aligned
+ *  special treatment for 'Vecs' is needed as no elements are available for the padded region
+ */
+template <typename Type,
+          typename IdxType,
+          typename LayoutPolicy,
+          std::size_t VecBytes,
+          int BlockSize,
+          typename Lambda,
+          typename... Vecs>
+void matrixLinewiseVecRowsSpan(
+  raft::device_aligned_matrix_view<Type, IdxType, LayoutPolicy> out,
+  raft::device_aligned_matrix_view<const Type, IdxType, LayoutPolicy> in,
+  const IdxType rowLen,
+  const IdxType nRows,
+  Lambda op,
+  cudaStream_t stream,
+  Vecs... vecs)
+{
+  constexpr std::size_t VecElems = VecBytes / sizeof(Type);
+  typedef raft::Pow2<VecBytes> AlignBytes;
+
+  typedef raft::Pow2<raft::layout_right_padded<Type>::padding> AlignPadding;
+
+  const uint paddedRowLen  = AlignPadding::roundUp(rowLen);
+  const IdxType alignedLen = paddedRowLen * nRows;
+
+  if (rowLen * nRows > 0) {
+    constexpr dim3 bs(BlockSize, 1, 1);
+    // The work arrangement is striped;
+    //   see more details at Linewise::vectorRows.
+    // Below is the work amount performed by one block in one iteration.
+    constexpr uint block_work_size = bs.x * uint(VecElems);
+    /* Here I would define `grid_work_size = lcm(block_work_size, paddedRowLen)` (Least Common
+       Multiple). This way, the grid spans a set of one or more padded rows each iteration, and,
+       most importantly, on every iteration each block processes the same set of indices within
+       a row (= the same set of vector indices).
+       This means, each block needs to load the values from the vector arguments only once.
+       Sadly, sometimes `grid_work_size > paddedRowLen*nRows`, and sometimes
+       `grid_work_size > UINT_MAX`. That's why I don't declare it here explicitly.
+       Instead, I straightaway compute the
+         expected_grid_size = lcm(block_work_size, paddedRowLen) / block_work_size
+     */
+    const uint expected_grid_size = paddedRowLen / raft::gcd(block_work_size, uint(paddedRowLen));
+    // Minimum size of the grid to make the device well occupied
+    const uint occupy = getOptimalGridSize<BlockSize>();
+    const dim3 gs(std::min(
+                    // does not make sense to have more blocks than this
+                    raft::ceildiv<uint>(uint(alignedLen), block_work_size),
+                    // increase the grid size to be not less than `occupy` while
+                    // still being the multiple of `expected_grid_size`
+                    raft::ceildiv<uint>(occupy, expected_grid_size) * expected_grid_size),
+                  1,
+                  1);
+
+    matrixLinewiseVecRowsSpanKernel<Type, IdxType, VecBytes, BlockSize, Lambda, Vecs...>
+      <<<gs, bs, 0, stream>>>(
+        out.data_handle(), in.data_handle(), rowLen, paddedRowLen, alignedLen, op, vecs...);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
+}
+
 /**
  * Select one of the implementations:
  *   a. vectors applied along/across lines
@@ -541,6 +733,47 @@ struct MatrixLinewiseOp {
       return matrixLinewiseVecCols<Type, IdxType, VecBytes, BlockSize, Lambda, Vecs...>(
         out, in, lineLen, nLines, op, stream, vecs...);
   }
+
+  template <typename Type,
+            typename IdxType,
+            typename LayoutPolicy,
+            typename Lambda,
+            typename... Vecs>
+  static void runPadded(raft::device_aligned_matrix_view<Type, IdxType, LayoutPolicy> out,
+                        raft::device_aligned_matrix_view<const Type, IdxType, LayoutPolicy> in,
+                        const IdxType lineLen,
+                        const IdxType nLines,
+                        const bool alongLines,
+                        Lambda op,
+                        cudaStream_t stream,
+                        Vecs... vecs)
+  {
+    constexpr auto is_rowmajor = std::is_same_v<LayoutPolicy, raft::layout_right_padded<Type>>;
+    constexpr auto is_colmajor = std::is_same_v<LayoutPolicy, raft::layout_left_padded<Type>>;
+
+    static_assert(is_rowmajor || is_colmajor,
+                  "layout for in and out must be either padded row or col major");
+
+    // also statically assert padded matrix alignment == 2^i*VecBytes
+    assert(raft::Pow2<VecBytes>::areSameAlignOffsets(in, out));
+
+    if (alongLines)
+      return matrixLinewiseVecRowsSpan<Type,
+                                       IdxType,
+                                       LayoutPolicy,
+                                       VecBytes,
+                                       BlockSize,
+                                       Lambda,
+                                       Vecs...>(out, in, lineLen, nLines, op, stream, vecs...);
+    else
+      return matrixLinewiseVecColsSpan<Type,
+                                       IdxType,
+                                       LayoutPolicy,
+                                       VecBytes,
+                                       BlockSize,
+                                       Lambda,
+                                       Vecs...>(out, in, lineLen, nLines, op, stream, vecs...);
+  }
 };
 
 }  // end namespace detail
diff --git a/cpp/include/raft/matrix/linewise_op.cuh b/cpp/include/raft/matrix/linewise_op.cuh
index 6b383b14f5..77f70239ea 100644
--- a/cpp/include/raft/matrix/linewise_op.cuh
+++ b/cpp/include/raft/matrix/linewise_op.cuh
@@ -42,9 +42,12 @@ namespace raft::matrix {
  * @param [in] alongLines whether vectors are indices along or across lines.
  * @param [in] op the operation applied on each line:
  *    for i in [0..lineLen) and j in [0..nLines):
+ *      out[j, i] = op(in[j, i], vec1[i], vec2[i], ... veck[i])   if alongLines = true
+ *      out[j, i] = op(in[j, i], vec1[j], vec2[j], ... veck[j])   if alongLines = false
+ *    where matrix indexing is row-major ([j, i] = [i + lineLen * j]).
  *      out[i, j] = op(in[i, j], vec1[i], vec2[i], ... veck[i])   if alongLines = true
  *      out[i, j] = op(in[i, j], vec1[j], vec2[j], ... veck[j])   if alongLines = false
- *    where matrix indexing is row-major ([i, j] = [i + lineLen * j]).
+ *    where matrix indexing is col-major ([i, j] = [i + lineLen * j]).
  * @param [in] vecs zero or more vectors to be passed as arguments,
  *    size of each vector is `alongLines ? lineLen : nLines`.
  */
@@ -67,8 +70,8 @@ void linewise_op(const raft::handle_t& handle,
   static_assert(is_rowmajor || is_colmajor,
                 "layout for in and out must be either row or col major");
 
-  const idx_t lineLen = is_rowmajor ? in.extent(0) : in.extent(1);
-  const idx_t nLines  = is_rowmajor ? in.extent(1) : in.extent(0);
+  const idx_t nLines  = is_rowmajor ? in.extent(0) : in.extent(1);
+  const idx_t lineLen = is_rowmajor ? in.extent(1) : in.extent(0);
 
   RAFT_EXPECTS(out.extent(0) == in.extent(0) && out.extent(1) == in.extent(1),
                "Input and output must have the same shape.");
@@ -82,4 +85,34 @@ void linewise_op(const raft::handle_t& handle,
                                                      handle.get_stream(),
                                                      vecs.data_handle()...);
 }
+
+template <typename m_t,
+          typename idx_t,
+          typename layout,
+          typename Lambda,
+          typename... vec_t,
+          typename = raft::enable_if_device_mdspan<vec_t...>>
+void linewise_op(const raft::handle_t& handle,
+                 raft::device_aligned_matrix_view<const m_t, idx_t, layout> in,
+                 raft::device_aligned_matrix_view<m_t, idx_t, layout> out,
+                 const bool alongLines,
+                 Lambda op,
+                 vec_t... vecs)
+{
+  constexpr auto is_rowmajor = std::is_same_v<layout, raft::layout_right_padded<m_t>>;
+  constexpr auto is_colmajor = std::is_same_v<layout, raft::layout_left_padded<m_t>>;
+
+  static_assert(is_rowmajor || is_colmajor,
+                "layout for in and out must be either padded row or col major");
+
+  const idx_t nLines  = is_rowmajor ? in.extent(0) : in.extent(1);
+  const idx_t lineLen = is_rowmajor ? in.extent(1) : in.extent(0);
+
+  RAFT_EXPECTS(out.extent(0) == in.extent(0) && out.extent(1) == in.extent(1),
+               "Input and output must have the same shape.");
+
+  detail::MatrixLinewiseOp<16, 256>::runPadded<m_t, idx_t>(
+    out, in, lineLen, nLines, alongLines, op, handle.get_stream(), vecs.data_handle()...);
+}
+
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/neighbors/ann_types.hpp b/cpp/include/raft/neighbors/ann_types.hpp
new file mode 100644
index 0000000000..5c6fd52be9
--- /dev/null
+++ b/cpp/include/raft/neighbors/ann_types.hpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/distance/distance_types.hpp>
+
+namespace raft::neighbors::ann {
+
+/** The base for approximate KNN index structures. */
+struct index {
+};
+
+/** The base for KNN index parameters. */
+struct index_params {
+  /** Distance type. */
+  raft::distance::DistanceType metric = distance::DistanceType::L2Expanded;
+  /** The argument used by some distance metrics. */
+  float metric_arg = 2.0f;
+  /**
+   * Whether to add the dataset content to the index, i.e.:
+   *
+   *  - `true` means the index is filled with the dataset vectors and ready to search after calling
+   * `build`.
+   *  - `false` means `build` only trains the underlying model (e.g. quantizer or clustering), but
+   * the index is left empty; you'd need to call `extend` on the index afterwards to populate it.
+   */
+  bool add_data_on_build = true;
+};
+
+struct search_params {
+};
+
+}  // namespace raft::neighbors::ann
diff --git a/cpp/include/raft/neighbors/ball_cover.cuh b/cpp/include/raft/neighbors/ball_cover.cuh
new file mode 100644
index 0000000000..28ff8491b6
--- /dev/null
+++ b/cpp/include/raft/neighbors/ball_cover.cuh
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __BALL_COVER_H
+#define __BALL_COVER_H
+
+#pragma once
+
+#include <cstdint>
+
+#include "ball_cover_types.hpp"
+#include <raft/distance/distance_types.hpp>
+#include <raft/spatial/knn/detail/ball_cover.cuh>
+#include <raft/spatial/knn/detail/ball_cover/common.cuh>
+#include <thrust/transform.h>
+
+namespace raft::neighbors::ball_cover {
+
+/**
+ * Builds and populates a previously unbuilt BallCoverIndex
+ *
+ * Usage example:
+ * @code{.cpp}
+ *
+ *  #include <raft/core/handle.hpp>
+ *  #include <raft/neighbors/ball_cover.cuh>
+ *  #include <raft/distance/distance_types.hpp>
+ *  using namespace raft::neighbors;
+ *
+ *  raft::handle_t handle;
+ *  ...
+ *  auto metric = raft::distance::DistanceType::L2SqrtExpanded;
+ *  ball_cover::BallCoverIndex index(handle, X, metric);
+ *
+ *  ball_cover::build_index(handle, index);
+ * @endcode
+ *
+ * @tparam idx_t knn index type
+ * @tparam value_t knn value type
+ * @tparam int_t integral type for knn params
+ * @tparam matrix_idx_t matrix indexing type
+ * @param[in] handle library resource management handle
+ * @param[inout] index an empty (and not previously built) instance of BallCoverIndex
+ */
+template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
+void build_index(const raft::handle_t& handle,
+                 BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index)
+{
+  ASSERT(index.n <= 3, "only 2d and 3d vectors are supported in current implementation");
+  if (index.metric == raft::distance::DistanceType::Haversine) {
+    raft::spatial::knn::detail::rbc_build_index(
+      handle, index, spatial::knn::detail::HaversineFunc<value_t, int_t>());
+  } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded ||
+             index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) {
+    raft::spatial::knn::detail::rbc_build_index(
+      handle, index, spatial::knn::detail::EuclideanFunc<value_t, int_t>());
+  } else {
+    RAFT_FAIL("Metric not supported");
+  }
+
+  index.set_index_trained();
+}
+
+/**
+ * Performs a faster exact knn in metric spaces using the triangle
+ * inequality with a number of landmark points to reduce the
+ * number of distance computations from O(n^2) to O(sqrt(n)). This
+ * performs an all neighbors knn, which can reuse memory when
+ * the index and query are the same array. This function will
+ * build the index and assumes rbc_build_index() has not already
+ * been called.
+ * @tparam idx_t knn index type
+ * @tparam value_t knn distance type
+ * @tparam int_t type for integers, such as number of rows/cols
+ * @param[in] handle raft handle for resource management
+ * @param[inout] index ball cover index which has not yet been built
+ * @param[in] k number of nearest neighbors to find
+ * @param[in] perform_post_filtering if this is false, only the closest k landmarks
+ *                               are considered (which will return approximate
+ *                               results).
+ * @param[out] inds output knn indices
+ * @param[out] dists output knn distances
+ * @param[in] weight a weight for overlap between the closest landmark and
+ *               the radius of other landmarks when pruning distances.
+ *               Setting this value below 1 can effectively turn off
+ *               computing distances against many other balls, enabling
+ *               approximate nearest neighbors. Recall can be adjusted
+ *               based on how many relevant balls are ignored. Note that
+ *               many datasets can still have great recall even by only
+ *               looking in the closest landmark.
+ */
+template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
+void all_knn_query(const raft::handle_t& handle,
+                   BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
+                   int_t k,
+                   idx_t* inds,
+                   value_t* dists,
+                   bool perform_post_filtering = true,
+                   float weight                = 1.0)
+{
+  ASSERT(index.n <= 3, "only 2d and 3d vectors are supported in current implementation");
+  if (index.metric == raft::distance::DistanceType::Haversine) {
+    raft::spatial::knn::detail::rbc_all_knn_query(
+      handle,
+      index,
+      k,
+      inds,
+      dists,
+      spatial::knn::detail::HaversineFunc<value_t, int_t>(),
+      perform_post_filtering,
+      weight);
+  } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded ||
+             index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) {
+    raft::spatial::knn::detail::rbc_all_knn_query(
+      handle,
+      index,
+      k,
+      inds,
+      dists,
+      spatial::knn::detail::EuclideanFunc<value_t, int_t>(),
+      perform_post_filtering,
+      weight);
+  } else {
+    RAFT_FAIL("Metric not supported");
+  }
+
+  index.set_index_trained();
+}
+
+/**
+ * Performs a faster exact knn in metric spaces using the triangle
+ * inequality with a number of landmark points to reduce the
+ * number of distance computations from O(n^2) to O(sqrt(n)). This
+ * performs an all neighbors knn, which can reuse memory when
+ * the index and query are the same array. This function will
+ * build the index and assumes rbc_build_index() has not already
+ * been called.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *
+ *  #include <raft/core/handle.hpp>
+ *  #include <raft/neighbors/ball_cover.cuh>
+ *  #include <raft/distance/distance_types.hpp>
+ *  using namespace raft::neighbors;
+ *
+ *  raft::handle_t handle;
+ *  ...
+ *  auto metric = raft::distance::DistanceType::L2SqrtExpanded;
+ *
+ *  // Construct a ball cover index
+ *  ball_cover::BallCoverIndex index(handle, X, metric);
+ *
+ *  // Perform all neighbors knn query
+ *  ball_cover::all_knn_query(handle, index, inds, dists, k);
+ * @endcode
+ *
+ * @tparam idx_t knn index type
+ * @tparam value_t knn distance type
+ * @tparam int_t type for integers, such as number of rows/cols
+ * @tparam matrix_idx_t matrix indexing type
+ *
+ * @param[in] handle raft handle for resource management
+ * @param[inout] index ball cover index which has not yet been built
+ * @param[out] inds output knn indices
+ * @param[out] dists output knn distances
+ * @param[in] k number of nearest neighbors to find
+ * @param[in] perform_post_filtering if this is false, only the closest k landmarks
+ *                               are considered (which will return approximate
+ *                               results).
+ * @param[in] weight a weight for overlap between the closest landmark and
+ *               the radius of other landmarks when pruning distances.
+ *               Setting this value below 1 can effectively turn off
+ *               computing distances against many other balls, enabling
+ *               approximate nearest neighbors. Recall can be adjusted
+ *               based on how many relevant balls are ignored. Note that
+ *               many datasets can still have great recall even by only
+ *               looking in the closest landmark.
+ */
+template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
+void all_knn_query(const raft::handle_t& handle,
+                   BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
+                   raft::device_matrix_view<idx_t, matrix_idx_t, row_major> inds,
+                   raft::device_matrix_view<value_t, matrix_idx_t, row_major> dists,
+                   int_t k,
+                   bool perform_post_filtering = true,
+                   float weight                = 1.0)
+{
+  RAFT_EXPECTS(index.n <= 3, "only 2d and 3d vectors are supported in current implementation");
+  RAFT_EXPECTS(k <= index.m,
+               "k must be less than or equal to the number of data points in the index");
+  RAFT_EXPECTS(inds.extent(1) == dists.extent(1) && dists.extent(1) == static_cast<matrix_idx_t>(k),
+               "Number of columns in output indices and distances matrices must be equal to k");
+
+  RAFT_EXPECTS(inds.extent(0) == dists.extent(0) && dists.extent(0) == index.get_X().extent(0),
+               "Number of rows in output indices and distances matrices must equal number of rows "
+               "in index matrix.");
+
+  all_knn_query(
+    handle, index, k, inds.data_handle(), dists.data_handle(), perform_post_filtering, weight);
+}
+
+/**
+ * Performs a faster exact knn in metric spaces using the triangle
+ * inequality with a number of landmark points to reduce the
+ * number of distance computations from O(n^2) to O(sqrt(n)). This
+ * function does not build the index and assumes rbc_build_index() has
+ * already been called. Use this function when the index and
+ * query arrays are different, otherwise use rbc_all_knn_query().
+ * @tparam idx_t index type
+ * @tparam value_t distances type
+ * @tparam int_t integer type for size info
+ * @param[in] handle raft handle for resource management
+ * @param[in] index ball cover index which has already been built
+ * @param[in] k number of nearest neighbors to find
+ * @param[in] query the query points (n_query_pts x index.n, row-major)
+ * @param[in] perform_post_filtering if this is false, only the closest k landmarks
+ *                               are considered (which will return approximate
+ *                               results).
+ * @param[out] inds output knn indices
+ * @param[out] dists output knn distances
+ * @param[in] weight a weight for overlap between the closest landmark and
+ *               the radius of other landmarks when pruning distances.
+ *               Setting this value below 1 can effectively turn off
+ *               computing distances against many other balls, enabling
+ *               approximate nearest neighbors. Recall can be adjusted
+ *               based on how many relevant balls are ignored. Note that
+ *               many datasets can still have great recall even by only
+ *               looking in the closest landmark.
+ * @param[in] n_query_pts number of query points
+ */
+template <typename idx_t, typename value_t, typename int_t>
+void knn_query(const raft::handle_t& handle,
+               const BallCoverIndex<idx_t, value_t, int_t>& index,
+               int_t k,
+               const value_t* query,
+               int_t n_query_pts,
+               idx_t* inds,
+               value_t* dists,
+               bool perform_post_filtering = true,
+               float weight                = 1.0)
+{
+  ASSERT(index.n <= 3, "only 2d and 3d vectors are supported in current implementation");
+  if (index.metric == raft::distance::DistanceType::Haversine) {
+    raft::spatial::knn::detail::rbc_knn_query(handle,
+                                              index,
+                                              k,
+                                              query,
+                                              n_query_pts,
+                                              inds,
+                                              dists,
+                                              spatial::knn::detail::HaversineFunc<value_t, int_t>(),
+                                              perform_post_filtering,
+                                              weight);
+  } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded ||
+             index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) {
+    raft::spatial::knn::detail::rbc_knn_query(handle,
+                                              index,
+                                              k,
+                                              query,
+                                              n_query_pts,
+                                              inds,
+                                              dists,
+                                              spatial::knn::detail::EuclideanFunc<value_t, int_t>(),
+                                              perform_post_filtering,
+                                              weight);
+  } else {
+    RAFT_FAIL("Metric not supported");
+  }
+}
+
+/**
+ * Performs a faster exact knn in metric spaces using the triangle
+ * inequality with a number of landmark points to reduce the
+ * number of distance computations from O(n^2) to O(sqrt(n)). This
+ * function does not build the index and assumes build_index() has
+ * already been called. Use this function when the index and
+ * query arrays are different, otherwise use all_knn_query().
+ *
+ * Usage example:
+ * @code{.cpp}
+ *
+ *  #include <raft/core/handle.hpp>
+ *  #include <raft/neighbors/ball_cover.cuh>
+ *  #include <raft/distance/distance_types.hpp>
+ *  using namespace raft::neighbors;
+ *
+ *  raft::handle_t handle;
+ *  ...
+ *  auto metric = raft::distance::DistanceType::L2SqrtExpanded;
+ *
+ *  // Build a ball cover index
+ *  ball_cover::BallCoverIndex index(handle, X, metric);
+ *  ball_cover::build_index(handle, index);
+ *
+ *  // Perform a knn query against the built index
+ *  ball_cover::knn_query(handle, index, query, inds, dists, k);
+ * @endcode
+ *
+ *
+ * @tparam idx_t index type
+ * @tparam value_t distances type
+ * @tparam int_t integer type for size info
+ * @tparam matrix_idx_t matrix indexing type
+ * @param[in] handle raft handle for resource management
+ * @param[in] index ball cover index which has already been built
+ * @param[in] query device matrix containing query data points
+ * @param[out] inds output knn indices
+ * @param[out] dists output knn distances
+ * @param[in] k number of nearest neighbors to find
+ * @param[in] perform_post_filtering if this is false, only the closest k landmarks
+ *                               are considered (which will return approximate
+ *                               results).
+ * @param[in] weight a weight for overlap between the closest landmark and
+ *               the radius of other landmarks when pruning distances.
+ *               Setting this value below 1 can effectively turn off
+ *               computing distances against many other balls, enabling
+ *               approximate nearest neighbors. Recall can be adjusted
+ *               based on how many relevant balls are ignored. Note that
+ *               many datasets can still have great recall even by only
+ *               looking in the closest landmark.
+ */
+template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
+void knn_query(const raft::handle_t& handle,
+               const BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
+               raft::device_matrix_view<const value_t, matrix_idx_t, row_major> query,
+               raft::device_matrix_view<idx_t, matrix_idx_t, row_major> inds,
+               raft::device_matrix_view<value_t, matrix_idx_t, row_major> dists,
+               int_t k,
+               bool perform_post_filtering = true,
+               float weight                = 1.0)
+{
+  RAFT_EXPECTS(k <= index.m,
+               "k must be less than or equal to the number of data points in the index");
+  RAFT_EXPECTS(inds.extent(1) == dists.extent(1) && dists.extent(1) == static_cast<matrix_idx_t>(k),
+               "Number of columns in output indices and distances matrices must be equal to k");
+
+  RAFT_EXPECTS(inds.extent(0) == dists.extent(0) && dists.extent(0) == query.extent(0),
+               "Number of rows in output indices and distances matrices must equal number of rows "
+               "in search matrix.");
+
+  RAFT_EXPECTS(query.extent(1) == index.get_X().extent(1),
+               "Number of columns in query and index matrices must match.");
+
+  knn_query(handle,
+            index,
+            k,
+            query.data_handle(),
+            query.extent(0),
+            inds.data_handle(),
+            dists.data_handle(),
+            perform_post_filtering,
+            weight);
+}
+
+// TODO: implement functions for:
+//  4. rbc_eps_neigh() - given a populated index, perform query against different query array
+//  5. rbc_all_eps_neigh() - populate a BallCoverIndex and query against training data
+
+}  // namespace raft::neighbors::ball_cover
+
+#endif
diff --git a/cpp/include/raft/neighbors/ball_cover_types.hpp b/cpp/include/raft/neighbors/ball_cover_types.hpp
new file mode 100644
index 0000000000..f6e49ab5c4
--- /dev/null
+++ b/cpp/include/raft/neighbors/ball_cover_types.hpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/handle.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft::neighbors::ball_cover {
+
+/**
+ * Stores raw index data points, sampled landmarks, the 1-nns of index points
+ * to their closest landmarks, and the ball radii of each landmark. This
+ * class is intended to be constructed once and reused across subsequent
+ * queries.
+ *
+ * The index keeps a non-owning view of the data matrix and a reference to the
+ * handle it was constructed with, so both must outlive the index.
+ *
+ * @tparam value_idx element type of the `R_indptr` and `R_1nn_cols` arrays
+ * @tparam value_t element type of the data points, distances and radii
+ * @tparam value_int integral type of `m`, `n` and `n_landmarks`
+ * @tparam matrix_idx indexing type of the views and arrays held by the index
+ */
+template <typename value_idx,
+          typename value_t,
+          typename value_int  = std::uint32_t,
+          typename matrix_idx = std::uint32_t>
+class BallCoverIndex {
+ public:
+  /**
+   * Creates an untrained index over a row-major device matrix given as a raw pointer.
+   *
+   * @param[in] handle_ raft handle; used to allocate the index arrays and stored by reference
+   * @param[in] X_ pointer to the data, wrapped in a device matrix view of shape [m_, n_]
+   * @param[in] m_ number of rows of X_
+   * @param[in] n_ number of columns of X_
+   * @param[in] metric_ distance metric stored with the index
+   */
+  explicit BallCoverIndex(const raft::handle_t& handle_,
+                          const value_t* X_,
+                          value_int m_,
+                          value_int n_,
+                          raft::distance::DistanceType metric_)
+    // NOTE: initializers are listed in member declaration order, which is the order in which
+    // they actually run. `handle` and `n_landmarks` are used by later initializers, so they
+    // must stay declared before the R_* members.
+    : handle(handle_),
+      m(m_),
+      n(n_),
+      // the sqrt() here makes the sqrt(m)^2 a linear-time lower bound
+      //
+      // Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m)
+      n_landmarks(sqrt(m_)),
+      X(raft::make_device_matrix_view<const value_t, matrix_idx>(X_, m_, n_)),
+      metric(metric_),
+      R_indptr(raft::make_device_vector<value_idx, matrix_idx>(handle, n_landmarks + 1)),
+      R_1nn_cols(raft::make_device_vector<value_idx, matrix_idx>(handle, m_)),
+      R_1nn_dists(raft::make_device_vector<value_t, matrix_idx>(handle, m_)),
+      R_closest_landmark_dists(raft::make_device_vector<value_t, matrix_idx>(handle, m_)),
+      R_radius(raft::make_device_vector<value_t, matrix_idx>(handle, n_landmarks)),
+      R(raft::make_device_matrix<value_t, matrix_idx>(handle, n_landmarks, n_)),
+      index_trained(false)
+  {
+  }
+
+  /**
+   * Creates an untrained index over a row-major device matrix view.
+   *
+   * @param[in] handle_ raft handle; used to allocate the index arrays and stored by reference
+   * @param[in] X_ view of the data; its extents provide `m` and `n`
+   * @param[in] metric_ distance metric stored with the index
+   */
+  explicit BallCoverIndex(const raft::handle_t& handle_,
+                          raft::device_matrix_view<const value_t, matrix_idx, row_major> X_,
+                          raft::distance::DistanceType metric_)
+    // NOTE: same ordering constraints as the pointer-based constructor above.
+    : handle(handle_),
+      m(X_.extent(0)),
+      n(X_.extent(1)),
+      // the sqrt() here makes the sqrt(m)^2 a linear-time lower bound
+      //
+      // Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m)
+      n_landmarks(sqrt(X_.extent(0))),
+      X(X_),
+      metric(metric_),
+      R_indptr(raft::make_device_vector<value_idx, matrix_idx>(handle, n_landmarks + 1)),
+      R_1nn_cols(raft::make_device_vector<value_idx, matrix_idx>(handle, X_.extent(0))),
+      R_1nn_dists(raft::make_device_vector<value_t, matrix_idx>(handle, X_.extent(0))),
+      R_closest_landmark_dists(raft::make_device_vector<value_t, matrix_idx>(handle, X_.extent(0))),
+      R_radius(raft::make_device_vector<value_t, matrix_idx>(handle, n_landmarks)),
+      R(raft::make_device_matrix<value_t, matrix_idx>(handle, n_landmarks, X_.extent(1))),
+      index_trained(false)
+  {
+  }
+
+  // Read-only views of the arrays owned by the index.
+  auto get_R_indptr() const -> raft::device_vector_view<const value_idx, matrix_idx>
+  {
+    return R_indptr.view();
+  }
+  auto get_R_1nn_cols() const -> raft::device_vector_view<const value_idx, matrix_idx>
+  {
+    return R_1nn_cols.view();
+  }
+  auto get_R_1nn_dists() const -> raft::device_vector_view<const value_t, matrix_idx>
+  {
+    return R_1nn_dists.view();
+  }
+  auto get_R_radius() const -> raft::device_vector_view<const value_t, matrix_idx>
+  {
+    return R_radius.view();
+  }
+  auto get_R() const -> raft::device_matrix_view<const value_t, matrix_idx, row_major>
+  {
+    return R.view();
+  }
+  auto get_R_closest_landmark_dists() const -> raft::device_vector_view<const value_t, matrix_idx>
+  {
+    return R_closest_landmark_dists.view();
+  }
+
+  // Mutable views of the same arrays, available on a non-const index.
+  raft::device_vector_view<value_idx, matrix_idx> get_R_indptr() { return R_indptr.view(); }
+  raft::device_vector_view<value_idx, matrix_idx> get_R_1nn_cols() { return R_1nn_cols.view(); }
+  raft::device_vector_view<value_t, matrix_idx> get_R_1nn_dists() { return R_1nn_dists.view(); }
+  raft::device_vector_view<value_t, matrix_idx> get_R_radius() { return R_radius.view(); }
+  raft::device_matrix_view<value_t, matrix_idx, row_major> get_R() { return R.view(); }
+  raft::device_vector_view<value_t, matrix_idx> get_R_closest_landmark_dists()
+  {
+    return R_closest_landmark_dists.view();
+  }
+
+  // Read-only view of the data matrix the index was constructed over.
+  raft::device_matrix_view<const value_t, matrix_idx, row_major> get_X() const { return X; }
+
+  raft::distance::DistanceType get_metric() const { return metric; }
+
+  value_int get_n_landmarks() const { return n_landmarks; }
+  bool is_index_trained() const { return index_trained; }
+
+  // This should only be set by internal functions
+  void set_index_trained() { index_trained = true; }
+
+  // Handle passed to the constructor; held by reference.
+  const raft::handle_t& handle;
+
+  // Number of rows (m) and columns (n) of X.
+  value_int m;
+  value_int n;
+  // sqrt(m) converted to value_int; sizes R, R_radius and (plus one) R_indptr.
+  value_int n_landmarks;
+
+  // Non-owning view of the data matrix.
+  raft::device_matrix_view<const value_t, matrix_idx, row_major> X;
+
+  raft::distance::DistanceType metric;
+
+ private:
+  // CSR storing the neighborhoods for each data point
+  raft::device_vector<value_idx, matrix_idx> R_indptr;
+  raft::device_vector<value_idx, matrix_idx> R_1nn_cols;
+  raft::device_vector<value_t, matrix_idx> R_1nn_dists;
+  raft::device_vector<value_t, matrix_idx> R_closest_landmark_dists;
+
+  // One entry per landmark.
+  raft::device_vector<value_t, matrix_idx> R_radius;
+
+  // n_landmarks x n matrix.
+  raft::device_matrix<value_t, matrix_idx, row_major> R;
+
+ protected:
+  // False after construction; switched to true by set_index_trained().
+  bool index_trained;
+};
+}  // namespace raft::neighbors::ball_cover
diff --git a/cpp/include/raft/neighbors/brute_force.cuh b/cpp/include/raft/neighbors/brute_force.cuh
new file mode 100644
index 0000000000..772ccb67d2
--- /dev/null
+++ b/cpp/include/raft/neighbors/brute_force.cuh
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/spatial/knn/detail/knn_brute_force_faiss.cuh>
+#include <raft/spatial/knn/detail/selection_faiss.cuh>
+
+namespace raft::neighbors::brute_force {
+
+/**
+ * @brief Performs a k-select across several (contiguous) row-partitioned index/distance
+ * matrices formatted like the following:
+ *
+ * part1row1: k0, k1, k2, k3
+ * part1row2: k0, k1, k2, k3
+ * part1row3: k0, k1, k2, k3
+ * part2row1: k0, k1, k2, k3
+ * part2row2: k0, k1, k2, k3
+ * part2row3: k0, k1, k2, k3
+ * etc...
+ *
+ * The example above shows what an aggregated index/distance matrix
+ * would look like with two partitions when n_samples=3 and k=4.
+ *
+ * When working with extremely large data sets that have been broken
+ * over multiple indexes, such as when computing over multiple GPUs,
+ * the ids will often start at 0 for each local knn index but the
+ * global ids need to be used when merging them together. An optional
+ * translations vector can be supplied to map the starting id of
+ * each partition to its global id so that the final merged knn
+ * is based on the global ids.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *  #include <raft/core/handle.hpp>
+ *  #include <raft/neighbors/brute_force.cuh>
+ *  using namespace raft::neighbors;
+ *
+ *  raft::handle_t handle;
+ *  ...
+ *  compute multiple knn graphs and aggregate row-wise
+ *  (see detailed description above)
+ *  ...
+ *  brute_force::knn_merge_parts(handle, in_keys, in_values, out_keys, out_values, n_samples);
+ * @endcode
+ *
+ * @tparam idx_t
+ * @tparam value_t
+ *
+ * @param[in] handle
+ * @param[in] in_keys matrix of input keys (size n_samples * n_parts * k)
+ * @param[in] in_values matrix of input values (size n_samples * n_parts * k)
+ * @param[out] out_keys matrix of output keys (size n_samples * k)
+ * @param[out] out_values matrix of output values (size n_samples * k)
+ * @param[in] n_samples number of rows in each partition
+ * @param[in] translations optional vector of starting global id mappings for each local partition
+ */
+template <typename idx_t, typename value_t>
+inline void knn_merge_parts(
+  const raft::handle_t& handle,
+  raft::device_matrix_view<const value_t, idx_t, row_major> in_keys,
+  raft::device_matrix_view<const idx_t, idx_t, row_major> in_values,
+  raft::device_matrix_view<value_t, idx_t, row_major> out_keys,
+  raft::device_matrix_view<idx_t, idx_t, row_major> out_values,
+  size_t n_samples,
+  std::optional<raft::device_vector_view<idx_t, idx_t>> translations = std::nullopt)
+{
+  // n_samples is used as a divisor below, so it has to be non-zero.
+  RAFT_EXPECTS(n_samples > 0, "n_samples must be greater than zero.");
+  RAFT_EXPECTS(in_keys.extent(1) == in_values.extent(1) && in_keys.extent(0) == in_values.extent(0),
+               "in_keys and in_values must have the same shape.");
+
+  // Each comparison is spelled out: `a == b == c` would compare the boolean result of `a == b`
+  // against `c` instead of checking that all three values agree.
+  RAFT_EXPECTS(
+    out_keys.extent(0) == out_values.extent(0) &&
+      static_cast<size_t>(out_keys.extent(0)) == n_samples,
+    "Number of rows in output keys and val matrices must equal number of rows in search matrix.");
+  RAFT_EXPECTS(
+    out_keys.extent(1) == out_values.extent(1) && out_keys.extent(1) == in_keys.extent(1),
+    "Number of columns in output indices and distances matrices must be equal to k");
+
+  // Hand the translations over as a raw device pointer, or nullptr when none were supplied.
+  idx_t* translations_ptr = translations.has_value() ? translations.value().data_handle() : nullptr;
+
+  // The input stacks the partitions row-wise, n_samples rows per partition.
+  auto n_parts = in_keys.extent(0) / n_samples;
+  spatial::knn::detail::knn_merge_parts(in_keys.data_handle(),
+                                        in_values.data_handle(),
+                                        out_keys.data_handle(),
+                                        out_values.data_handle(),
+                                        n_samples,
+                                        n_parts,
+                                        in_keys.extent(1),
+                                        handle.get_stream(),
+                                        translations_ptr);
+}
+
+/**
+ * @brief Flat C++ API function to perform a brute force knn on
+ * a series of input arrays and combine the results into a single
+ * output array for indexes and distances. Inputs can be either
+ * row- or column-major but the output matrices will always be in
+ * row-major format.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *  #include <raft/core/handle.hpp>
+ *  #include <raft/neighbors/brute_force.cuh>
+ *  #include <raft/distance/distance_types.hpp>
+ *  using namespace raft::neighbors;
+ *
+ *  raft::handle_t handle;
+ *  ...
+ *  int k = 10;
+ *  auto metric = raft::distance::DistanceType::L2SqrtExpanded;
+ *  brute_force::knn(handle, index, search, indices, distances, k, metric);
+ * @endcode
+ *
+ * @tparam idx_t element type of the output indices
+ * @tparam value_t element type of the input data and of the output distances
+ * @tparam value_int integral type of `k` and of the sizes forwarded to the implementation
+ * @tparam matrix_idx indexing type of the matrix views
+ * @tparam index_layout layout of the index matrices
+ * @tparam search_layout layout of the search matrix
+ *
+ * @param[in] handle: the raft handle to use
+ * @param[in] index: vector of device matrices (each size m_i*d) to be used as the knn index;
+ *                   must contain at least one matrix
+ * @param[in] search: matrix (size n*d) to be used for searching the index
+ * @param[out] indices: matrix (size n*k) to store output knn indices
+ * @param[out] distances: matrix (size n*k) to store the output knn distance
+ * @param[in] k: the number of nearest neighbors to return
+ * @param[in] metric: distance metric to use. Euclidean (L2) is used by default
+ * @param[in] metric_arg: the value of `p` for Minkowski (l-p) distances. This
+ *                        is ignored if the metric_type is not Minkowski.
+ * @param[in] global_id_offset: optional starting global id mapping for the local partition
+ *                              (assumes the index contains contiguous ids in the global id space)
+ */
+template <typename idx_t,
+          typename value_t,
+          typename value_int,
+          typename matrix_idx,
+          typename index_layout,
+          typename search_layout>
+void knn(raft::handle_t const& handle,
+         std::vector<raft::device_matrix_view<const value_t, matrix_idx, index_layout>> index,
+         raft::device_matrix_view<const value_t, matrix_idx, search_layout> search,
+         raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,
+         raft::device_matrix_view<value_t, matrix_idx, row_major> distances,
+         value_int k,
+         distance::DistanceType metric         = distance::DistanceType::L2Unexpanded,
+         std::optional<float> metric_arg       = std::make_optional<float>(2.0f),
+         std::optional<idx_t> global_id_offset = std::nullopt)
+{
+  // index[0] is dereferenced below, so an empty vector has to be rejected up front.
+  RAFT_EXPECTS(!index.empty(), "At least one index matrix must be provided");
+
+  // A single dimensionality (taken from index[0]) is forwarded for all partitions, so every
+  // partition has to agree with the search matrix.
+  for (const auto& part : index) {
+    RAFT_EXPECTS(part.extent(1) == search.extent(1),
+                 "Number of dimensions for both index and search matrices must be equal");
+  }
+
+  RAFT_EXPECTS(indices.extent(0) == distances.extent(0) && distances.extent(0) == search.extent(0),
+               "Number of rows in output indices and distances matrices must equal number of rows "
+               "in search matrix.");
+  RAFT_EXPECTS(
+    indices.extent(1) == distances.extent(1) && distances.extent(1) == static_cast<matrix_idx>(k),
+    "Number of columns in output indices and distances matrices must be equal to k");
+
+  // The layouts are known at compile time; they are forwarded as runtime flags.
+  const bool rowMajorIndex = std::is_same_v<index_layout, layout_c_contiguous>;
+  const bool rowMajorQuery = std::is_same_v<search_layout, layout_c_contiguous>;
+
+  // Flatten the partition views into parallel arrays of data pointers and row counts.
+  std::vector<value_t*> inputs;
+  std::vector<value_int> sizes;
+  inputs.reserve(index.size());
+  sizes.reserve(index.size());
+  for (const auto& part : index) {
+    inputs.push_back(const_cast<value_t*>(part.data_handle()));
+    sizes.push_back(part.extent(0));
+  }
+
+  // NOTE(review): a single offset is forwarded regardless of index.size(); confirm that the
+  // implementation accepts this when more than one partition is given.
+  std::vector<idx_t> trans;
+  if (global_id_offset.has_value()) { trans.push_back(global_id_offset.value()); }
+
+  std::vector<idx_t>* trans_arg = global_id_offset.has_value() ? &trans : nullptr;
+
+  raft::spatial::knn::detail::brute_force_knn_impl(handle,
+                                                   inputs,
+                                                   sizes,
+                                                   static_cast<value_int>(index[0].extent(1)),
+                                                   // TODO: This is unfortunate. Need to fix.
+                                                   const_cast<value_t*>(search.data_handle()),
+                                                   static_cast<value_int>(search.extent(0)),
+                                                   indices.data_handle(),
+                                                   distances.data_handle(),
+                                                   k,
+                                                   rowMajorIndex,
+                                                   rowMajorQuery,
+                                                   trans_arg,
+                                                   metric,
+                                                   metric_arg.value_or(2.0f));
+}
+
+}  // namespace raft::neighbors::brute_force
diff --git a/cpp/include/raft/neighbors/epsilon_neighborhood.cuh b/cpp/include/raft/neighbors/epsilon_neighborhood.cuh
new file mode 100644
index 0000000000..114216fc50
--- /dev/null
+++ b/cpp/include/raft/neighbors/epsilon_neighborhood.cuh
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __EPSILON_NEIGH_H
+#define __EPSILON_NEIGH_H
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/handle.hpp>
+#include <raft/spatial/knn/detail/epsilon_neighborhood.cuh>
+
+namespace raft::neighbors::epsilon_neighborhood {
+
+/**
+ * @brief Epsilon neighborhood computation under the L2-Squared distance metric.
+ *
+ * Thin forwarder to the implementation of the same name in `spatial::knn::detail`.
+ *
+ * @tparam value_t   IO and math type
+ * @tparam idx_t    Index type
+ *
+ * @param[out] adj    adjacency matrix [row-major] [on device] [dim = m x n]
+ * @param[out] vd     vertex degree array [on device] [len = m + 1]
+ *                    `vd + m` stores the total number of edges in the adjacency
+ *                    matrix. Pass a nullptr if you don't need this info.
+ * @param[in]  x      first matrix [row-major] [on device] [dim = m x k]
+ * @param[in]  y      second matrix [row-major] [on device] [dim = n x k]
+ * @param[in]  m      number of rows in x
+ * @param[in]  n      number of rows in y
+ * @param[in]  k      number of columns in x and y
+ * @param[in]  eps    epsilon neighborhood radius; pass the squared radius, because
+ *                    the distances computed by this method are L2-squared
+ * @param[in]  stream cuda stream
+ */
+template <typename value_t, typename idx_t>
+void epsUnexpL2SqNeighborhood(bool* adj,
+                              idx_t* vd,
+                              const value_t* x,
+                              const value_t* y,
+                              idx_t m,
+                              idx_t n,
+                              idx_t k,
+                              value_t eps,
+                              cudaStream_t stream)
+{
+  namespace impl = spatial::knn::detail;
+
+  // All arguments are passed through unchanged.
+  impl::epsUnexpL2SqNeighborhood<value_t, idx_t>(adj, vd, x, y, m, n, k, eps, stream);
+}
+
+/**
+ * @brief Computes epsilon neighborhood for the L2-Squared distance metric and given ball size.
+ * The epsilon neighbors is represented by a dense boolean adjacency matrix of size m * n and
+ * an array of degrees for each vertex, which can be used as a compressed sparse row (CSR)
+ * indptr array.
+ *
+ * @code{.cpp}
+ *  #include <raft/neighbors/epsilon_neighborhood.cuh>
+ *  #include <raft/core/handle.hpp>
+ *  #include <raft/core/device_mdarray.hpp>
+ *  using namespace raft::neighbors;
+ *  raft::handle_t handle;
+ *  ...
+ *  auto adj = raft::make_device_matrix<bool>(handle, m, n);
+ *  auto vd = raft::make_device_vector<int>(handle, m+1);
+ *  epsilon_neighborhood::eps_neighbors_l2sq(handle, x, y, adj.view(), vd.view(), eps);
+ * @endcode
+ *
+ * @tparam value_t   IO and math type
+ * @tparam idx_t    Index type
+ * @tparam matrix_idx_t matrix indexing type
+ *
+ * @param[in]  handle raft handle to manage library resources
+ * @param[in]  x      first matrix [row-major] [on device] [dim = m x k]
+ * @param[in]  y      second matrix [row-major] [on device] [dim = n x k]
+ * @param[out] adj    adjacency matrix [row-major] [on device] [dim = m x n]
+ * @param[out] vd     vertex degree array [on device] [len = m + 1]
+ *                    `vd + m` stores the total number of edges in the adjacency
+ *                    matrix. The length of this view is not validated here.
+ * @param[in]  eps    defines epsilon neighborhood radius (should be passed as
+ *                    squared as we compute L2-squared distance in this method)
+ */
+template <typename value_t, typename idx_t, typename matrix_idx_t>
+void eps_neighbors_l2sq(const raft::handle_t& handle,
+                        raft::device_matrix_view<const value_t, matrix_idx_t, row_major> x,
+                        raft::device_matrix_view<const value_t, matrix_idx_t, row_major> y,
+                        raft::device_matrix_view<bool, matrix_idx_t, row_major> adj,
+                        raft::device_vector_view<idx_t, matrix_idx_t> vd,
+                        value_t eps)
+{
+  // Only x.extent(1) is forwarded as the column count, so y has to agree with it.
+  RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns in x and y must be equal.");
+
+  // The adjacency matrix is written through a raw pointer with dimensions taken from x and y.
+  RAFT_EXPECTS(adj.extent(0) == x.extent(0) && adj.extent(1) == y.extent(0),
+               "adj must have one row per row of x and one column per row of y.");
+
+  // Forward to the pointer-based overload on the handle's stream.
+  epsUnexpL2SqNeighborhood<value_t, idx_t>(adj.data_handle(),
+                                           vd.data_handle(),
+                                           x.data_handle(),
+                                           y.data_handle(),
+                                           static_cast<idx_t>(x.extent(0)),
+                                           static_cast<idx_t>(y.extent(0)),
+                                           static_cast<idx_t>(x.extent(1)),
+                                           eps,
+                                           handle.get_stream());
+}
+
+}  // namespace raft::neighbors::epsilon_neighborhood
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/neighbors/ivf_flat.cuh b/cpp/include/raft/neighbors/ivf_flat.cuh
new file mode 100644
index 0000000000..5317f406e1
--- /dev/null
+++ b/cpp/include/raft/neighbors/ivf_flat.cuh
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "ivf_flat_types.hpp"
+#include <raft/spatial/knn/detail/ivf_flat_build.cuh>
+#include <raft/spatial/knn/detail/ivf_flat_search.cuh>
+
+#include <raft/core/handle.hpp>
+
+#include <raft/core/device_mdspan.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+namespace raft::neighbors::ivf_flat {
+
+/**
+ * @brief Build the index from the dataset for efficient search (raw-pointer interface).
+ *
+ * NB: Currently, the following distance metrics are supported:
+ * - L2Expanded
+ * - L2Unexpanded
+ * - InnerProduct
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   // use default index parameters
+ *   ivf_flat::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = ivf_flat::build(handle, index_params, dataset, N, D);
+ *   // use default search parameters
+ *   ivf_flat::search_params search_params;
+ *   // search K nearest neighbours for each of the N queries
+ *   ivf_flat::search(handle, search_params, index, queries, N, K, out_inds, out_dists);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] handle
+ * @param[in] params configure the index building
+ * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
+ * @param[in] n_rows the number of samples
+ * @param[in] dim the dimensionality of the data
+ *
+ * @return the constructed ivf-flat index
+ */
+template <typename T, typename IdxT>
+auto build(const handle_t& handle,
+           const index_params& params,
+           const T* dataset,
+           IdxT n_rows,
+           uint32_t dim) -> index<T, IdxT>
+{
+  namespace impl = raft::spatial::knn::ivf_flat::detail;
+
+  // All arguments are passed through unchanged.
+  return impl::build(handle, params, dataset, n_rows, dim);
+}
+
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * NB: Currently, the following distance metrics are supported:
+ * - L2Expanded
+ * - L2Unexpanded
+ * - InnerProduct
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   // use default index parameters
+ *   ivf_flat::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = ivf_flat::build(handle, dataset, index_params);
+ *   // use default search parameters
+ *   ivf_flat::search_params search_params;
+ *   // search K nearest neighbours for each of the N queries
+ *   ivf_flat::search(handle, index, queries, out_inds, out_dists, search_params, k);
+ * @endcode
+ *
+ * @tparam value_t data element type
+ * @tparam idx_t type of the indices in the source dataset; also the indexing type of the
+ *               dataset view
+ *
+ * @param[in] handle
+ * @param[in] dataset a device matrix view of a row-major matrix [n_rows, dim]
+ * @param[in] params configure the index building
+ *
+ * @return the constructed ivf-flat index
+ */
+template <typename value_t, typename idx_t>
+auto build(const handle_t& handle,
+           raft::device_matrix_view<const value_t, idx_t, row_major> dataset,
+           const index_params& params) -> index<value_t, idx_t>
+{
+  // The row count keeps the index type; the dimensionality is converted to uint32_t, the type
+  // in which the pointer-based build overload declares `dim`.
+  return raft::spatial::knn::ivf_flat::detail::build(handle,
+                                                     params,
+                                                     dataset.data_handle(),
+                                                     static_cast<idx_t>(dataset.extent(0)),
+                                                     static_cast<uint32_t>(dataset.extent(1)));
+}
+
+/**
+ * @brief Build a new index containing the data of the original plus new extra vectors
+ * (raw-pointer interface).
+ *
+ * Implementation note:
+ *    The new data is clustered according to existing kmeans clusters, then the cluster
+ *    centers are adjusted to match the newly labeled data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   ivf_flat::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_flat::build(handle, index_params, dataset, N, D);
+ *   // fill the index with the data
+ *   auto index = ivf_flat::extend(handle, index_empty, dataset, nullptr, N);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] handle
+ * @param[in] orig_index original index
+ * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
+ * @param[in] new_indices a device pointer to a vector of indices [n_rows].
+ *    If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[in] n_rows number of rows in `new_vectors`
+ *
+ * @return the constructed extended ivf-flat index
+ */
+template <typename T, typename IdxT>
+auto extend(const handle_t& handle,
+            const index<T, IdxT>& orig_index,
+            const T* new_vectors,
+            const IdxT* new_indices,
+            IdxT n_rows) -> index<T, IdxT>
+{
+  namespace impl = raft::spatial::knn::ivf_flat::detail;
+
+  // All arguments are passed through unchanged.
+  return impl::extend(handle, orig_index, new_vectors, new_indices, n_rows);
+}
+
+/**
+ * @brief Build a new index containing the data of the original plus new extra vectors.
+ *
+ * Implementation note:
+ *    The new data is clustered according to existing kmeans clusters, then the cluster
+ *    centers are adjusted to match the newly labeled data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   ivf_flat::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_flat::build(handle, dataset, index_params);
+ *   // fill the index with the data
+ *   auto index = ivf_flat::extend(handle, index_empty, dataset);
+ * @endcode
+ *
+ * @tparam value_t data element type
+ * @tparam idx_t type of the indices in the source dataset; also the indexing type of the
+ *               views
+ *
+ * @param[in] handle
+ * @param[in] orig_index original index
+ * @param[in] new_vectors a device matrix view of a row-major matrix [n_rows, index.dim()]
+ * @param[in] new_indices an optional device vector view of indices [n_rows].
+ *    If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt`
+ *    here to imply a continuous range `[0...n_rows)`.
+ *
+ * @return the constructed extended ivf-flat index
+ */
+template <typename value_t, typename idx_t>
+auto extend(const handle_t& handle,
+            const index<value_t, idx_t>& orig_index,
+            raft::device_matrix_view<const value_t, idx_t, row_major> new_vectors,
+            std::optional<raft::device_vector_view<const idx_t, idx_t>> new_indices = std::nullopt)
+  -> index<value_t, idx_t>
+{
+  // Unwrap the optional view into the raw pointer expected by the pointer-based overload;
+  // nullptr stands for "no indices given".
+  const idx_t* new_indices_ptr = nullptr;
+  if (new_indices.has_value()) {
+    // Only the row count of new_vectors is forwarded, so the indices must match it.
+    RAFT_EXPECTS(new_indices.value().extent(0) == new_vectors.extent(0),
+                 "new_indices must contain one index for each row of new_vectors");
+    new_indices_ptr = new_indices.value().data_handle();
+  }
+
+  return extend<value_t, idx_t>(
+    handle, orig_index, new_vectors.data_handle(), new_indices_ptr, new_vectors.extent(0));
+}
+
+/**
+ * @brief Extend the index in-place with the new data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   ivf_flat::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_flat::build(handle, index_params, dataset, N, D);
+ *   // fill the index with the data (note the pointer: this selects the in-place overload)
+ *   ivf_flat::extend(handle, &index_empty, dataset, nullptr, N);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param handle
+ * @param[inout] index pointer to the index to extend; must not be null. The pointed-to index
+ *    is overwritten with the extended index.
+ * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
+ * @param[in] new_indices a device pointer to a vector of indices [n_rows].
+ *    If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[in] n_rows the number of samples
+ */
+template <typename T, typename IdxT>
+void extend(const handle_t& handle,
+            index<T, IdxT>* index,
+            const T* new_vectors,
+            const IdxT* new_indices,
+            IdxT n_rows)
+{
+  // The pointer is dereferenced twice below.
+  RAFT_EXPECTS(index != nullptr, "index must not be a null pointer");
+
+  // Build the extended index with the returning overload and assign it over the original.
+  *index = extend(handle, *index, new_vectors, new_indices, n_rows);
+}
+
+/**
+ * @brief Extend the index in-place with the new data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   ivf_flat::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_flat::build(handle, dataset, index_params);
+ *   // fill the index with the data (note the pointer: this selects the in-place overload)
+ *   ivf_flat::extend(handle, &index_empty, dataset);
+ * @endcode
+ *
+ * @tparam value_t data element type
+ * @tparam idx_t type of the indices in the source dataset; also the indexing type of the
+ *               views
+ *
+ * @param[in] handle
+ * @param[inout] index pointer to the index to extend; must not be null. The pointed-to index
+ *    is overwritten with the extended index.
+ * @param[in] new_vectors a device matrix view of a row-major matrix [n_rows, index.dim()]
+ * @param[in] new_indices an optional device vector view of indices [n_rows].
+ *    If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt`
+ *    here to imply a continuous range `[0...n_rows)`.
+ */
+template <typename value_t, typename idx_t>
+void extend(const handle_t& handle,
+            index<value_t, idx_t>* index,
+            raft::device_matrix_view<const value_t, idx_t, row_major> new_vectors,
+            std::optional<raft::device_vector_view<const idx_t, idx_t>> new_indices = std::nullopt)
+{
+  // The pointer is dereferenced twice below.
+  RAFT_EXPECTS(index != nullptr, "index must not be a null pointer");
+
+  // Unwrap the optional view into the raw pointer expected by the pointer-based overload;
+  // nullptr stands for "no indices given".
+  const idx_t* new_indices_ptr = nullptr;
+  if (new_indices.has_value()) {
+    // Only the row count of new_vectors is forwarded, so the indices must match it.
+    RAFT_EXPECTS(new_indices.value().extent(0) == new_vectors.extent(0),
+                 "new_indices must contain one index for each row of new_vectors");
+    new_indices_ptr = new_indices.value().data_handle();
+  }
+
+  // Build the extended index with the returning overload and assign it over the original.
+  *index = extend(handle,
+                  *index,
+                  new_vectors.data_handle(),
+                  new_indices_ptr,
+                  static_cast<idx_t>(new_vectors.extent(0)));
+}
+
+/**
+ * @brief Search ANN using the constructed index.
+ *
+ * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example.
+ *
+ * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
+ * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
+ * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
+ * eliminate entirely allocations happening within `search`:
+ * @code{.cpp}
+ *   ...
+ *   // Create a pooling memory resource with a pre-defined initial size.
+ *   rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr(
+ *     rmm::mr::get_current_device_resource(), 1024 * 1024);
+ *   // use default search parameters
+ *   ivf_flat::search_params search_params;
+ *   // Use the same allocator across multiple searches to reduce the number of
+ *   // cuda memory allocations
+ *   ivf_flat::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr);
+ *   ivf_flat::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr);
+ *   ivf_flat::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr);
+ *   ...
+ * @endcode
+ * The exact size of the temporary buffer depends on multiple factors and is an implementation
+ * detail. However, you can safely specify a small initial size for the memory pool, so that only a
+ * few allocations happen to grow it during the first invocations of the `search`.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] handle
+ * @param[in] params configure the search
+ * @param[in] index ivf-flat constructed index
+ * @param[in] queries a device pointer to a row-major matrix [n_queries, index.dim()]
+ * @param[in] n_queries the batch size
+ * @param[in] k the number of neighbors to find for each query.
+ * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
+ * [n_queries, k]
+ * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
+ * @param[in] mr an optional memory resource to use across the searches (you can provide a large
+ * enough memory pool here to avoid memory allocations within search). Defaults to `nullptr`.
+ */
+template <typename T, typename IdxT>
+void search(const handle_t& handle,
+            const search_params& params,
+            const index<T, IdxT>& index,
+            const T* queries,
+            uint32_t n_queries,
+            uint32_t k,
+            IdxT* neighbors,
+            float* distances,
+            rmm::mr::device_memory_resource* mr = nullptr)
+{
+  return raft::spatial::knn::ivf_flat::detail::search(
+    handle, params, index, queries, n_queries, k, neighbors, distances, mr);
+}
+
+/**
+ * @brief Search ANN using the constructed index.
+ *
+ * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example.
+ *
+ * Note, this overload does not accept a memory resource: it always forwards `nullptr` as `mr` to
+ * the pointer-based `search` overload. Call that overload directly if you want to supply a custom
+ * (e.g. pooling) memory resource. The number of queries must fit into a 32-bit unsigned integer.
+ * @code{.cpp}
+ *   ...
+ *   // use default search parameters
+ *   ivf_flat::search_params search_params;
+ *   ivf_flat::search(handle, index, queries1, out_inds1, out_dists1, search_params, K);
+ *   ivf_flat::search(handle, index, queries2, out_inds2, out_dists2, search_params, K);
+ *   ivf_flat::search(handle, index, queries3, out_inds3, out_dists3, search_params, K);
+ *   ...
+ * @endcode
+ *
+ * @tparam value_t data element type
+ * @tparam idx_t type of the indices
+ * @tparam int_t precision / type of integral arguments
+ *
+ * @param[in] handle
+ * @param[in] index ivf-flat constructed index
+ * @param[in] queries a device matrix view of the row-major queries [n_queries, index.dim()]
+ * @param[out] neighbors a device matrix view of the indices of the neighbors in the source dataset
+ * [n_queries, k]
+ * @param[out] distances a device view of the distances to the selected neighbors [n_queries, k]
+ * @param[in] params configure the search
+ * @param[in] k the number of neighbors to find for each query.
+ */
+template <typename value_t, typename idx_t, typename int_t>
+void search(const handle_t& handle,
+            const index<value_t, idx_t>& index,
+            raft::device_matrix_view<const value_t, idx_t, row_major> queries,
+            raft::device_matrix_view<idx_t, idx_t, row_major> neighbors,
+            raft::device_matrix_view<float, idx_t, row_major> distances,
+            const search_params& params,
+            int_t k)
+{
+  RAFT_EXPECTS(
+    queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
+    "Number of rows in output neighbors and distances matrices must equal the number of queries.");
+
+  RAFT_EXPECTS(
+    neighbors.extent(1) == distances.extent(1) && neighbors.extent(1) == static_cast<idx_t>(k),
+    "Number of columns in output neighbors and distances matrices must equal k");
+
+  RAFT_EXPECTS(queries.extent(1) == index.dim(),
+               "Number of query dimensions should equal number of dimensions in the index.");
+
+  const auto n_queries = static_cast<std::uint32_t>(queries.extent(0));  // 32-bit in the callee
+  RAFT_EXPECTS(static_cast<idx_t>(n_queries) == queries.extent(0),
+               "Number of queries must fit into a 32-bit unsigned integer.");
+
+  return search(handle,
+                params,
+                index,
+                queries.data_handle(),
+                n_queries,
+                static_cast<std::uint32_t>(k),
+                neighbors.data_handle(),
+                distances.data_handle(),
+                nullptr);
+}
+
+}  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/include/raft/neighbors/ivf_flat_types.hpp b/cpp/include/raft/neighbors/ivf_flat_types.hpp
new file mode 100644
index 0000000000..c7e3798f5d
--- /dev/null
+++ b/cpp/include/raft/neighbors/ivf_flat_types.hpp
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "ann_types.hpp"
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/error.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/util/integer_utils.hpp>
+
+#include <optional>
+#include <type_traits>
+
+namespace raft::neighbors::ivf_flat {
+
+/** Number of rows in one interleaved group of a list (see `index::data` description). */
+constexpr static uint32_t kIndexGroupSize = 32;
+
+struct index_params : ann::index_params {  // IVF-flat specific additions to ann::index_params
+  /** The number of inverted lists (clusters); also the number of kmeans centers in the index. */
+  uint32_t n_lists = 1024;
+  /** The number of iterations searching for kmeans centers (index building). */
+  uint32_t kmeans_n_iters = 20;
+  /** The fraction of data to use during iterative kmeans building (1.0 = the whole dataset). */
+  double kmeans_trainset_fraction = 0.5;
+};
+
+struct search_params : ann::search_params {  // IVF-flat specific additions to ann::search_params
+  /** The number of clusters (inverted lists) to search. */
+  uint32_t n_probes = 20;
+};
+
+static_assert(std::is_aggregate_v<index_params>);   // both parameter structs are required to
+static_assert(std::is_aggregate_v<search_params>);  // remain aggregates (checked at compile time)
+
+/**
+ * @brief IVF-flat index.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *    (must be able to represent all values of `uint32_t`, see the `static_assert` below)
+ */
+template <typename T, typename IdxT>
+struct index : ann::index {
+  static_assert(!raft::is_narrowing_v<uint32_t, IdxT>,
+                "IdxT must be able to represent all values of uint32_t");
+
+ public:
+  /**
+   * Vectorized load/store size in elements, determines the size of interleaved data chunks.
+   *
+   * TODO: in theory, we can lift this to the template parameter and keep it at hardware maximum
+   * possible value by padding the `dim` of the data https://github.com/rapidsai/raft/issues/711
+   */
+  [[nodiscard]] constexpr inline auto veclen() const noexcept -> uint32_t { return veclen_; }
+  /** Distance metric used for clustering. */
+  [[nodiscard]] constexpr inline auto metric() const noexcept -> raft::distance::DistanceType
+  {
+    return metric_;
+  }
+  /**
+   * Inverted list data [size, dim].
+   *
+   * The data consists of the dataset rows, grouped by their labels (into clusters/lists).
+   * Within each list (cluster), the data is grouped into blocks of `kIndexGroupSize` interleaved
+   * vectors. Note, the total index length is slightly larger than the source dataset length,
+   * because each cluster is padded by `kIndexGroupSize` elements.
+   *
+   * Interleaving pattern:
+   * within groups of `kIndexGroupSize` rows, the data is interleaved with the block size equal to
+   * `veclen * sizeof(T)`. That is, a chunk of `veclen` consecutive components of one row is
+   * followed by a chunk of the same size of the next row, and so on.
+   *
+   * __Example__: veclen = 2, dim = 6, kIndexGroupSize = 32, list_size = 31
+   *
+   *     x[ 0, 0], x[ 0, 1], x[ 1, 0], x[ 1, 1], ... x[14, 0], x[14, 1], x[15, 0], x[15, 1],
+   *     x[16, 0], x[16, 1], x[17, 0], x[17, 1], ... x[30, 0], x[30, 1],    -    ,    -    ,
+   *     x[ 0, 2], x[ 0, 3], x[ 1, 2], x[ 1, 3], ... x[14, 2], x[14, 3], x[15, 2], x[15, 3],
+   *     x[16, 2], x[16, 3], x[17, 2], x[17, 3], ... x[30, 2], x[30, 3],    -    ,    -    ,
+   *     x[ 0, 4], x[ 0, 5], x[ 1, 4], x[ 1, 5], ... x[14, 4], x[14, 5], x[15, 4], x[15, 5],
+   *     x[16, 4], x[16, 5], x[17, 4], x[17, 5], ... x[30, 4], x[30, 5],    -    ,    -    ,
+   *
+   */
+  inline auto data() noexcept -> device_mdspan<T, extent_2d<IdxT>, row_major>
+  {
+    return data_.view();
+  }
+  [[nodiscard]] inline auto data() const noexcept
+    -> device_mdspan<const T, extent_2d<IdxT>, row_major>
+  {
+    return data_.view();
+  }
+
+  /** Inverted list indices: ids of items in the source data [size] */
+  inline auto indices() noexcept -> device_mdspan<IdxT, extent_1d<IdxT>, row_major>
+  {
+    return indices_.view();
+  }
+  [[nodiscard]] inline auto indices() const noexcept
+    -> device_mdspan<const IdxT, extent_1d<IdxT>, row_major>
+  {
+    return indices_.view();
+  }
+
+  /** Sizes of the lists (clusters) [n_lists] */
+  inline auto list_sizes() noexcept -> device_mdspan<uint32_t, extent_1d<uint32_t>, row_major>
+  {
+    return list_sizes_.view();
+  }
+  [[nodiscard]] inline auto list_sizes() const noexcept
+    -> device_mdspan<const uint32_t, extent_1d<uint32_t>, row_major>
+  {
+    return list_sizes_.view();
+  }
+
+  /**
+   * Offsets into the lists [n_lists + 1].
+   * The last value contains the total length of the index.
+   */
+  inline auto list_offsets() noexcept -> device_mdspan<IdxT, extent_1d<uint32_t>, row_major>
+  {
+    return list_offsets_.view();
+  }
+  [[nodiscard]] inline auto list_offsets() const noexcept
+    -> device_mdspan<const IdxT, extent_1d<uint32_t>, row_major>
+  {
+    return list_offsets_.view();
+  }
+
+  /** k-means cluster centers corresponding to the lists [n_lists, dim] */
+  inline auto centers() noexcept -> device_mdspan<float, extent_2d<uint32_t>, row_major>
+  {
+    return centers_.view();
+  }
+  [[nodiscard]] inline auto centers() const noexcept
+    -> device_mdspan<const float, extent_2d<uint32_t>, row_major>
+  {
+    return centers_.view();
+  }
+
+  /**
+   * (Optional) Precomputed norms of the `centers` w.r.t. the chosen distance metric [n_lists].
+   *
+   * NB: this may be empty if the index is empty or if the metric does not require the center norms
+   * calculation.
+   */
+  inline auto center_norms() noexcept
+    -> std::optional<device_mdspan<float, extent_1d<uint32_t>, row_major>>
+  {
+    if (center_norms_.has_value()) {
+      return std::make_optional<device_mdspan<float, extent_1d<uint32_t>, row_major>>(
+        center_norms_->view());
+    } else {
+      return std::nullopt;
+    }
+  }
+  [[nodiscard]] inline auto center_norms() const noexcept
+    -> std::optional<device_mdspan<const float, extent_1d<uint32_t>, row_major>>
+  {
+    if (center_norms_.has_value()) {
+      return std::make_optional<device_mdspan<const float, extent_1d<uint32_t>, row_major>>(
+        center_norms_->view());
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  /** Total length of the index (the extent of `indices`). */
+  [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT { return indices_.extent(0); }
+  /** Dimensionality of the data (the second extent of `centers`). */
+  [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t
+  {
+    return centers_.extent(1);
+  }
+  /** Number of clusters/inverted lists (the first extent of `centers`). */
+  [[nodiscard]] constexpr inline auto n_lists() const noexcept -> uint32_t
+  {
+    return centers_.extent(0);
+  }
+
+  // Don't allow copying the index for performance reasons (try avoiding copying data)
+  index(const index&) = delete;
+  index(index&&)      = default;
+  auto operator=(const index&) -> index& = delete;
+  auto operator=(index&&) -> index& = default;
+  ~index()                          = default;
+
+  /** Construct an empty index. It needs to be trained and then populated. */
+  index(const handle_t& handle, raft::distance::DistanceType metric, uint32_t n_lists, uint32_t dim)
+    : ann::index(),
+      veclen_(calculate_veclen(dim)),
+      metric_(metric),
+      data_(make_device_mdarray<T>(handle, make_extents<IdxT>(0, dim))),
+      indices_(make_device_mdarray<IdxT>(handle, make_extents<IdxT>(0))),
+      list_sizes_(make_device_mdarray<uint32_t>(handle, make_extents<uint32_t>(n_lists))),
+      list_offsets_(make_device_mdarray<IdxT>(handle, make_extents<uint32_t>(n_lists + 1))),
+      centers_(make_device_mdarray<float>(handle, make_extents<uint32_t>(n_lists, dim))),
+      center_norms_(std::nullopt)
+  {
+    check_consistency();
+  }
+
+  /** Construct an empty index from `params.metric` and `params.n_lists`; see the overload above. */
+  index(const handle_t& handle, const index_params& params, uint32_t dim)
+    : index(handle, params.metric, params.n_lists, dim)
+  {
+  }
+
+  /**
+   * Replace `data`, `indices` and (optionally) `center_norms` with new uninitialized mdarrays to
+   * hold the indicated amount of data; the lists' sizes/offsets and the centers are kept as is.
+   */
+  void allocate(const handle_t& handle, IdxT index_size, bool allocate_center_norms)
+  {
+    data_    = make_device_mdarray<T>(handle, make_extents<IdxT>(index_size, dim()));
+    indices_ = make_device_mdarray<IdxT>(handle, make_extents<IdxT>(index_size));
+    center_norms_ =
+      allocate_center_norms
+        ? std::optional(make_device_mdarray<float>(handle, make_extents<uint32_t>(n_lists())))
+        : std::nullopt;
+    check_consistency();
+  }
+
+ private:
+  /**
+   * TODO: in theory, we can lift this to the template parameter and keep it at hardware maximum
+   * possible value by padding the `dim` of the data https://github.com/rapidsai/raft/issues/711
+   */
+  uint32_t veclen_;
+  raft::distance::DistanceType metric_;
+  device_mdarray<T, extent_2d<IdxT>, row_major> data_;
+  device_mdarray<IdxT, extent_1d<IdxT>, row_major> indices_;
+  device_mdarray<uint32_t, extent_1d<uint32_t>, row_major> list_sizes_;
+  device_mdarray<IdxT, extent_1d<uint32_t>, row_major> list_offsets_;
+  device_mdarray<float, extent_2d<uint32_t>, row_major> centers_;
+  std::optional<device_mdarray<float, extent_1d<uint32_t>, row_major>> center_norms_;
+
+  /** Throw an error if the index content is inconsistent (only reads the members). */
+  void check_consistency() const
+  {
+    RAFT_EXPECTS(dim() % veclen_ == 0, "dimensionality is not a multiple of the veclen");
+    RAFT_EXPECTS(data_.extent(0) == indices_.extent(0), "inconsistent index size");
+    RAFT_EXPECTS(data_.extent(1) == IdxT(centers_.extent(1)), "inconsistent data dimensionality");
+    RAFT_EXPECTS(                                               //
+      (centers_.extent(0) == list_sizes_.extent(0)) &&          //
+        (centers_.extent(0) + 1 == list_offsets_.extent(0)) &&  //
+        (!center_norms_.has_value() || centers_.extent(0) == center_norms_->extent(0)),
+      "inconsistent number of lists (clusters)");
+    RAFT_EXPECTS(reinterpret_cast<size_t>(data_.data_handle()) % (veclen_ * sizeof(T)) == 0,
+                 "The data storage pointer is not aligned to the vector length");
+  }
+
+  static auto calculate_veclen(uint32_t dim) -> uint32_t
+  {
+    // TODO: consider padding the dimensions and fixing veclen to its maximum possible value as a
+    // template parameter (https://github.com/rapidsai/raft/issues/711)
+    uint32_t veclen = 16 / sizeof(T);  // start from the number of `T` elements in 16 bytes
+    while (dim % veclen != 0) {        // halve until it divides `dim` (1 always does)
+      veclen = veclen >> 1;
+    }
+    return veclen;
+  }
+};
+
+}  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
new file mode 100644
index 0000000000..207e298947
--- /dev/null
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "ivf_pq_types.hpp"
+#include <raft/spatial/knn/detail/ivf_pq_build.cuh>
+#include <raft/spatial/knn/detail/ivf_pq_search.cuh>
+
+#include <raft/core/handle.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+namespace raft::neighbors::ivf_pq {
+
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * NB: Currently, the following distance metrics are supported:
+ * - L2Expanded
+ * - L2Unexpanded
+ * - InnerProduct
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   // use default index parameters
+ *   ivf_pq::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = ivf_pq::build(handle, index_params, dataset, N, D);
+ *   // use default search parameters
+ *   ivf_pq::search_params search_params;
+ *   // search K nearest neighbours for each of the N queries
+ *   ivf_pq::search(handle, search_params, index, queries, N, K, out_inds, out_dists);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset (defaults to `uint32_t`)
+ *
+ * @param[in] handle
+ * @param[in] params configure the index building
+ * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
+ * @param[in] n_rows the number of samples
+ * @param[in] dim the dimensionality of the data
+ *
+ * @return the constructed ivf-pq index
+ */
+template <typename T, typename IdxT = uint32_t>
+inline auto build(
+  const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim)
+  -> index<IdxT>
+{
+  return raft::spatial::knn::ivf_pq::detail::build(handle, params, dataset, n_rows, dim);
+}
+
+/**
+ * @brief Build a new index containing the data of the original plus new extra vectors.
+ *
+ * Implementation note:
+ *    The new data is assigned to the existing kmeans clusters; the cluster centers
+ *    themselves are left unchanged.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   ivf_pq::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_pq::build(handle, index_params, dataset, N, D);
+ *   // fill the index with the data
+ *   auto index = ivf_pq::extend(handle, index_empty, dataset, nullptr, N);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] handle
+ * @param[in] orig_index original index
+ * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, orig_index.dim()]
+ * @param[in] new_indices a device pointer to a vector of indices [n_rows].
+ *    If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[in] n_rows the number of samples
+ *
+ * @return the constructed extended ivf-pq index
+ */
+template <typename T, typename IdxT>
+inline auto extend(const handle_t& handle,
+                   const index<IdxT>& orig_index,
+                   const T* new_vectors,
+                   const IdxT* new_indices,
+                   IdxT n_rows) -> index<IdxT>
+{
+  return raft::spatial::knn::ivf_pq::detail::extend(
+    handle, orig_index, new_vectors, new_indices, n_rows);
+}
+
+/**
+ * @brief Extend the index in-place with the new data.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] handle
+ * @param[inout] index the index to extend; it is overwritten with the extended index
+ * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index->dim()]
+ * @param[in] new_indices a device pointer to a vector of indices [n_rows].
+ *    If the index is empty (`index->size() == 0`), you can pass `nullptr`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[in] n_rows the number of samples
+ */
+template <typename T, typename IdxT>
+inline void extend(const handle_t& handle,
+                   index<IdxT>* index,
+                   const T* new_vectors,
+                   const IdxT* new_indices,
+                   IdxT n_rows)
+{
+  *index = extend(handle, *index, new_vectors, new_indices, n_rows);
+}
+
+/**
+ * @brief Search ANN using the constructed index.
+ *
+ * See the [ivf_pq::build](#ivf_pq::build) documentation for a usage example.
+ *
+ * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
+ * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
+ * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
+ * eliminate entirely allocations happening within `search`:
+ * @code{.cpp}
+ *   ...
+ *   // Create a pooling memory resource with a pre-defined initial size.
+ *   rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr(
+ *     rmm::mr::get_current_device_resource(), 1024 * 1024);
+ *   // use default search parameters
+ *   ivf_pq::search_params search_params;
+ *   // Use the same allocator across multiple searches to reduce the number of
+ *   // cuda memory allocations
+ *   ivf_pq::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr);
+ *   ivf_pq::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr);
+ *   ivf_pq::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr);
+ *   ...
+ * @endcode
+ * The exact size of the temporary buffer depends on multiple factors and is an implementation
+ * detail. However, you can safely specify a small initial size for the memory pool, so that only a
+ * few allocations happen to grow it during the first invocations of the `search`.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] handle
+ * @param[in] params configure the search
+ * @param[in] index ivf-pq constructed index
+ * @param[in] queries a device pointer to a row-major matrix [n_queries, index.dim()]
+ * @param[in] n_queries the batch size
+ * @param[in] k the number of neighbors to find for each query.
+ * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
+ * [n_queries, k]
+ * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
+ * @param[in] mr an optional memory resource to use across the searches (you can provide a large
+ * enough memory pool here to avoid memory allocations within search). Defaults to `nullptr`.
+ */
+template <typename T, typename IdxT>
+inline void search(const handle_t& handle,
+                   const search_params& params,
+                   const index<IdxT>& index,
+                   const T* queries,
+                   uint32_t n_queries,
+                   uint32_t k,
+                   IdxT* neighbors,
+                   float* distances,
+                   rmm::mr::device_memory_resource* mr = nullptr)
+{
+  return raft::spatial::knn::ivf_pq::detail::search(
+    handle, params, index, queries, n_queries, k, neighbors, distances, mr);
+}
+
+}  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp
new file mode 100644
index 0000000000..3dbf004e95
--- /dev/null
+++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "ann_types.hpp"
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/error.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/util/integer_utils.hpp>
+
+#include <type_traits>
+
+namespace raft::neighbors::ivf_pq {
+
+/** Selects how PQ codebooks are trained: one per subspace or one per first-level cluster. */
+enum class codebook_gen {  // NOLINT
+  PER_SUBSPACE = 0,        // NOLINT
+  PER_CLUSTER  = 1,        // NOLINT
+};
+
+struct index_params : ann::index_params {
+  /**
+   * The number of inverted lists (clusters)
+   *
+   * Hint: the number of vectors per cluster (`n_rows/n_lists`) should be approximately 1,000 to
+   * 10,000.
+   */
+  uint32_t n_lists = 1024;
+  /** The number of iterations searching for kmeans centers (index building). */
+  uint32_t kmeans_n_iters = 20;
+  /** The fraction of data to use during iterative kmeans building. */
+  double kmeans_trainset_fraction = 0.5;
+  /**
+   * The bit length of the vector element after compression by PQ.
+   *
+   * Possible values: [4, 5, 6, 7, 8].
+   *
+   * Hint: the smaller the 'pq_bits', the smaller the index size and the better the search
+   * performance, but the lower the recall.
+   */
+  uint32_t pq_bits = 8;
+  /**
+   * The dimensionality of the vector after compression by PQ. When zero, an optimal value is
+   * selected using a heuristic.
+   *
+   * NB: `pq_dim * pq_bits` must be a multiple of 8.
+   *
+   * Hint: a smaller 'pq_dim' results in a smaller index size and better search performance, but
+   * lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number, but multiples of 8 are
+   * desirable for good performance. If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8.
+   * For good performance, it is desirable that 'pq_dim' is a multiple of 32. Ideally, 'pq_dim'
+   * should be also a divisor of the dataset dim.
+   */
+  uint32_t pq_dim = 0;
+  /** How PQ codebooks are created. */
+  codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE;
+  /**
+   * Apply a random rotation matrix on the input data and queries even if `dim % pq_dim == 0`.
+   *
+   * Note: if `dim` is not a multiple of `pq_dim`, a random rotation is always applied to the input
+   * data and queries to transform the working space from `dim` to `rot_dim`, which may be slightly
+   * larger than the original space and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
+   * However, this transform is not necessary when `dim` is a multiple of `pq_dim`
+   *   (`dim == rot_dim`, hence no need to add "extra" data columns / features).
+   *
+   * By default, if `dim == rot_dim`, the rotation transform is initialized with the identity
+   * matrix. When `force_random_rotation == true`, a random orthogonal transform matrix is generated
+   * regardless of the values of `dim` and `pq_dim`.
+   */
+  bool force_random_rotation = false;
+};
+
+struct search_params : ann::search_params {
+  /** The number of clusters to search. */
+  uint32_t n_probes = 20;
+  /**
+   * Data type of look up table to be created dynamically at search time.
+   *
+   * Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U]
+   *
+   * The use of low-precision types reduces the amount of shared memory required at search time, so
+   * fast shared memory kernels can be used even for datasets with large dimensionality. Note that
+   * the recall is slightly degraded when a low-precision type is selected.
+   */
+  cudaDataType_t lut_dtype = CUDA_R_32F;
+  /**
+   * Storage data type for distance/similarity computed at search time.
+   *
+   * Possible values: [CUDA_R_16F, CUDA_R_32F]
+   *
+   * If the performance limiter at search time is device memory access, selecting FP16 will improve
+   * performance slightly.
+   */
+  cudaDataType_t internal_distance_dtype = CUDA_R_32F;
+  /**
+   * Thread block size of the distance calculation kernel at search time.
+   * When zero, an optimal block size is selected using a heuristic.
+   *
+   * Possible values: [0, 256, 512, 1024]
+   */
+  uint32_t preferred_thread_block_size = 0;
+};
+
+static_assert(std::is_aggregate_v<index_params>);
+static_assert(std::is_aggregate_v<search_params>);
+
+/**
+ * @brief IVF-PQ index.
+ *
+ * In the IVF-PQ index, a database vector y is approximated with two level quantization:
+ *
+ * y = Q_1(y) + Q_2(y - Q_1(y))
+ *
+ * The first level quantizer (Q_1), maps the vector y to the nearest cluster center. The number of
+ * clusters is n_lists.
+ *
+ * The second quantizer encodes the residual, and it is defined as a product quantizer [1].
+ *
+ * A product quantizer encodes a `dim` dimensional vector with a `pq_dim` dimensional vector.
+ * First we split the input vector into `pq_dim` subvectors (denoted by u), where each u vector
+ * contains `pq_len` distinct components of y
+ *
+ * y_1, y_2, ... y_{pq_len}, y_{pq_len+1}, ... y_{2*pq_len}, ... y_{dim-pq_len+1} ... y_{dim}
+ *  \___________________/     \____________________________/      \______________________/
+ *         u_1                         u_2                          u_{pq_dim}
+ *
+ * Then each subvector is encoded with a separate quantizer q_i, and the results are concatenated
+ *
+ * Q_2(y) = q_1(u_1),q_2(u_2),...,q_{pq_dim}(u_{pq_dim})
+ *
+ * Each quantizer q_i outputs a code with pq_bits bits. The second level quantizers are also defined
+ * by k-means clustering in the corresponding sub-space: the reproduction values are the centroids,
+ * and the set of reproduction values is the codebook.
+ *
+ * When the data dimensionality `dim` is not multiple of `pq_dim`, the feature space is transformed
+ * using a random orthogonal matrix to have `rot_dim = pq_dim * pq_len` dimensions
+ * (`rot_dim >= dim`).
+ *
+ * The second-level quantizers are trained either for each subspace or for each cluster:
+ *   (a) codebook_gen::PER_SUBSPACE:
+ *         creates `pq_dim` second-level quantizers - one for each slice of the data along features;
+ *   (b) codebook_gen::PER_CLUSTER:
+ *         creates `n_lists` second-level quantizers - one for each first-level cluster.
+ * In either case, the centroids are again found using k-means clustering interpreting the data as
+ * having pq_len dimensions.
+ *
+ * [1] Product quantization for nearest neighbor search Herve Jegou, Matthijs Douze, Cordelia Schmid
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ */
+template <typename IdxT>
+struct index : ann::index {
+  static_assert(!raft::is_narrowing_v<uint32_t, IdxT>,
+                "IdxT must be able to represent all values of uint32_t");
+
+ public:
+  /** Total length of the index. */
+  [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT { return indices_.extent(0); }
+  /** Dimensionality of the input data. */
+  [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t { return dim_; }
+  /**
+   * Dimensionality of the cluster centers:
+   * input data dim extended with vector norms and padded to 8 elems.
+   */
+  [[nodiscard]] constexpr inline auto dim_ext() const noexcept -> uint32_t
+  {
+    return raft::round_up_safe(dim() + 1, 8u);
+  }
+  /**
+   * Dimensionality of the data after transforming it for PQ processing
+   * (rotated and augmented to be a multiple of `pq_dim`).
+   */
+  [[nodiscard]] constexpr inline auto rot_dim() const noexcept -> uint32_t
+  {
+    return pq_len() * pq_dim();
+  }
+  /** The bit length of an encoded vector element after compression by PQ. */
+  [[nodiscard]] constexpr inline auto pq_bits() const noexcept -> uint32_t { return pq_bits_; }
+  /** The dimensionality of an encoded vector after compression by PQ. */
+  [[nodiscard]] constexpr inline auto pq_dim() const noexcept -> uint32_t { return pq_dim_; }
+  /** Dimensionality of a subspace, i.e. the number of vector components mapped to a subspace */
+  [[nodiscard]] constexpr inline auto pq_len() const noexcept -> uint32_t
+  {
+    return raft::div_rounding_up_unsafe(dim(), pq_dim());
+  }
+  /** The number of vectors in a PQ codebook (`1 << pq_bits`). */
+  [[nodiscard]] constexpr inline auto pq_book_size() const noexcept -> uint32_t
+  {
+    return uint32_t{1} << pq_bits();  // unsigned operand: no signed-int shift before conversion
+  }
+  /** Distance metric used for clustering. */
+  [[nodiscard]] constexpr inline auto metric() const noexcept -> raft::distance::DistanceType
+  {
+    return metric_;
+  }
+  /** How PQ codebooks are created. */
+  [[nodiscard]] constexpr inline auto codebook_kind() const noexcept -> codebook_gen
+  {
+    return codebook_kind_;
+  }
+  /** Number of clusters/inverted lists (first level quantization). */
+  [[nodiscard]] constexpr inline auto n_lists() const noexcept -> uint32_t { return n_lists_; }
+  /** Number of non-empty clusters/inverted lists. */
+  [[nodiscard]] constexpr inline auto n_nonempty_lists() const noexcept -> uint32_t
+  {
+    return n_nonempty_lists_;
+  }
+
+  // Don't allow copying the index for performance reasons (try avoiding copying data)
+  index(const index&) = delete;
+  index(index&&)      = default;
+  auto operator=(const index&) -> index& = delete;
+  auto operator=(index&&) -> index& = default;
+  ~index()                          = default;
+
+  /** Construct an empty index. It needs to be trained and then populated. */
+  index(const handle_t& handle,
+        raft::distance::DistanceType metric,
+        codebook_gen codebook_kind,
+        uint32_t n_lists,
+        uint32_t dim,
+        uint32_t pq_bits          = 8,
+        uint32_t pq_dim           = 0,
+        uint32_t n_nonempty_lists = 0)
+    : ann::index(),
+      metric_(metric),
+      codebook_kind_(codebook_kind),
+      n_lists_(n_lists),
+      dim_(dim),
+      pq_bits_(pq_bits),
+      pq_dim_(pq_dim == 0 ? calculate_pq_dim(dim) : pq_dim),
+      n_nonempty_lists_(n_nonempty_lists),
+      pq_centers_{make_device_mdarray<float>(handle, make_pq_centers_extents())},
+      pq_dataset_{make_device_mdarray<uint8_t>(
+        handle, make_extents<IdxT>(0, this->pq_dim() * this->pq_bits() / 8))},
+      indices_{make_device_mdarray<IdxT>(handle, make_extents<IdxT>(0))},
+      rotation_matrix_{
+        make_device_mdarray<float>(handle, make_extents<uint32_t>(this->rot_dim(), this->dim()))},
+      list_offsets_{make_device_mdarray<IdxT>(handle, make_extents<uint32_t>(this->n_lists() + 1))},
+      centers_{make_device_mdarray<float>(
+        handle, make_extents<uint32_t>(this->n_lists(), this->dim_ext()))},
+      centers_rot_{make_device_mdarray<float>(
+        handle, make_extents<uint32_t>(this->n_lists(), this->rot_dim()))}
+  {
+    check_consistency();
+  }
+
+  /** Construct an empty index. It needs to be trained and then populated. */
+  index(const handle_t& handle,
+        const index_params& params,
+        uint32_t dim,
+        uint32_t n_nonempty_lists = 0)
+    : index(handle,
+            params.metric,
+            params.codebook_kind,
+            params.n_lists,
+            dim,
+            params.pq_bits,
+            params.pq_dim,
+            n_nonempty_lists)
+  {
+  }
+
+  /**
+   * Replace the content of the index with new uninitialized mdarrays to hold the indicated amount
+   * of data.
+   */
+  void allocate(const handle_t& handle, IdxT index_size)
+  {
+    pq_dataset_ =
+      make_device_mdarray<uint8_t>(handle, make_extents<IdxT>(index_size, pq_dataset_.extent(1)));
+    indices_ = make_device_mdarray<IdxT>(handle, make_extents<IdxT>(index_size));
+    check_consistency();
+  }
+
+  /**
+   * PQ cluster centers
+   *
+   *   - codebook_gen::PER_SUBSPACE: [pq_dim , pq_book_size, pq_len]
+   *   - codebook_gen::PER_CLUSTER:  [n_lists, pq_book_size, pq_len]
+   */
+  inline auto pq_centers() noexcept -> device_mdspan<float, extent_3d<uint32_t>, row_major>
+  {
+    return pq_centers_.view();
+  }
+  [[nodiscard]] inline auto pq_centers() const noexcept
+    -> device_mdspan<const float, extent_3d<uint32_t>, row_major>
+  {
+    return pq_centers_.view();
+  }
+
+  /** PQ-encoded data [size, pq_dim * pq_bits / 8]. */
+  inline auto pq_dataset() noexcept -> device_mdspan<uint8_t, extent_2d<IdxT>, row_major>
+  {
+    return pq_dataset_.view();
+  }
+  [[nodiscard]] inline auto pq_dataset() const noexcept
+    -> device_mdspan<const uint8_t, extent_2d<IdxT>, row_major>
+  {
+    return pq_dataset_.view();
+  }
+
+  /** Inverted list indices: ids of items in the source data [size] */
+  inline auto indices() noexcept -> device_mdspan<IdxT, extent_1d<IdxT>, row_major>
+  {
+    return indices_.view();
+  }
+  [[nodiscard]] inline auto indices() const noexcept
+    -> device_mdspan<const IdxT, extent_1d<IdxT>, row_major>
+  {
+    return indices_.view();
+  }
+
+  /** The transform matrix (original space -> rotated padded space) [rot_dim, dim] */
+  inline auto rotation_matrix() noexcept -> device_mdspan<float, extent_2d<uint32_t>, row_major>
+  {
+    return rotation_matrix_.view();
+  }
+  [[nodiscard]] inline auto rotation_matrix() const noexcept
+    -> device_mdspan<const float, extent_2d<uint32_t>, row_major>
+  {
+    return rotation_matrix_.view();
+  }
+
+  /**
+   * Offsets into the lists [n_lists + 1].
+   * The last value contains the total length of the index.
+   */
+  inline auto list_offsets() noexcept -> device_mdspan<IdxT, extent_1d<uint32_t>, row_major>
+  {
+    return list_offsets_.view();
+  }
+  [[nodiscard]] inline auto list_offsets() const noexcept
+    -> device_mdspan<const IdxT, extent_1d<uint32_t>, row_major>
+  {
+    return list_offsets_.view();
+  }
+
+  /** Cluster centers corresponding to the lists in the original space [n_lists, dim_ext] */
+  inline auto centers() noexcept -> device_mdspan<float, extent_2d<uint32_t>, row_major>
+  {
+    return centers_.view();
+  }
+  [[nodiscard]] inline auto centers() const noexcept
+    -> device_mdspan<const float, extent_2d<uint32_t>, row_major>
+  {
+    return centers_.view();
+  }
+
+  /** Cluster centers corresponding to the lists in the rotated space [n_lists, rot_dim] */
+  inline auto centers_rot() noexcept -> device_mdspan<float, extent_2d<uint32_t>, row_major>
+  {
+    return centers_rot_.view();
+  }
+  [[nodiscard]] inline auto centers_rot() const noexcept
+    -> device_mdspan<const float, extent_2d<uint32_t>, row_major>
+  {
+    return centers_rot_.view();
+  }
+
+ private:
+  raft::distance::DistanceType metric_;
+  codebook_gen codebook_kind_;
+  uint32_t n_lists_;
+  uint32_t dim_;
+  uint32_t pq_bits_;
+  uint32_t pq_dim_;
+  uint32_t n_nonempty_lists_;
+
+  device_mdarray<float, extent_3d<uint32_t>, row_major> pq_centers_;
+  device_mdarray<uint8_t, extent_2d<IdxT>, row_major> pq_dataset_;
+  device_mdarray<IdxT, extent_1d<IdxT>, row_major> indices_;
+  device_mdarray<float, extent_2d<uint32_t>, row_major> rotation_matrix_;
+  device_mdarray<IdxT, extent_1d<uint32_t>, row_major> list_offsets_;
+  device_mdarray<float, extent_2d<uint32_t>, row_major> centers_;
+  device_mdarray<float, extent_2d<uint32_t>, row_major> centers_rot_;
+
+  /** Throw an error if the index content is inconsistent. */
+  void check_consistency()
+  {
+    RAFT_EXPECTS(pq_bits() >= 4 && pq_bits() <= 8,
+                 "`pq_bits` must be within closed range [4,8], but got %u.",
+                 pq_bits());
+    RAFT_EXPECTS((pq_bits() * pq_dim()) % 8 == 0,
+                 "`pq_bits * pq_dim` must be a multiple of 8, but got %u * %u = %u.",
+                 pq_bits(),
+                 pq_dim(),
+                 pq_bits() * pq_dim());
+  }
+
+  auto make_pq_centers_extents() -> extent_3d<uint32_t>
+  {
+    switch (codebook_kind()) {
+      case codebook_gen::PER_SUBSPACE:
+        return make_extents<uint32_t>(pq_dim(), pq_book_size(), pq_len());
+      case codebook_gen::PER_CLUSTER:
+        return make_extents<uint32_t>(n_lists(), pq_book_size(), pq_len());
+      default: RAFT_FAIL("Unreachable code");
+    }
+  }
+
+  static inline auto calculate_pq_dim(uint32_t dim) -> uint32_t
+  {
+    // Heuristic default for pq_dim: halve large dimensionalities (>= 128) to improve performance.
+    if (dim >= 128) { dim /= 2; }
+    // Round it down to a multiple of 32 to improve performance.
+    uint32_t r = raft::round_down_safe<uint32_t>(dim, 32);
+    if (r > 0) return r;
+    // Otherwise (rounding gave 0), use the largest power of two <= dim (yields 1 for dim == 0).
+    r = 1;
+    while ((r << 1) <= dim) {
+      r = r << 1;
+    }
+    return r;
+  }
+};
+
+}  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/include/raft/neighbors/specializations.cuh b/cpp/include/raft/neighbors/specializations.cuh
new file mode 100644
index 0000000000..0511bbbf6c
--- /dev/null
+++ b/cpp/include/raft/neighbors/specializations.cuh
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KNN_SPECIALIZATIONS_H
+#define __KNN_SPECIALIZATIONS_H
+
+#pragma once
+
+#include <raft/neighbors/specializations/ball_cover.cuh>
+#include <raft/neighbors/specializations/fused_l2_knn.cuh>
+#include <raft/neighbors/specializations/knn.cuh>
+
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
+
+#endif
diff --git a/cpp/include/raft/spatial/knn/specializations/ball_cover.cuh b/cpp/include/raft/neighbors/specializations/ball_cover.cuh
similarity index 72%
rename from cpp/include/raft/spatial/knn/specializations/ball_cover.cuh
rename to cpp/include/raft/neighbors/specializations/ball_cover.cuh
index a861375b2f..f20d1adc35 100644
--- a/cpp/include/raft/spatial/knn/specializations/ball_cover.cuh
+++ b/cpp/include/raft/neighbors/specializations/ball_cover.cuh
@@ -16,23 +16,20 @@
 
 #pragma once
 
-#include <raft/spatial/knn/ball_cover.cuh>
-#include <raft/spatial/knn/ball_cover_types.hpp>
-#include <raft/spatial/knn/specializations/detail/ball_cover_lowdim.hpp>
+#include <raft/neighbors/ball_cover.cuh>
+#include <raft/neighbors/ball_cover_types.hpp>
 
 #include <cstdint>
 
-namespace raft {
-namespace spatial {
-namespace knn {
+namespace raft::neighbors::ball_cover {
 extern template class BallCoverIndex<int, float, std::uint32_t, std::uint32_t>;
 extern template class BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>;
 
-extern template void rbc_build_index<std::int64_t, float, std::uint32_t, std::uint32_t>(
+extern template void build_index<std::int64_t, float, std::uint32_t, std::uint32_t>(
   const raft::handle_t& handle,
   BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index);
 
-extern template void rbc_knn_query<std::int64_t, float, std::uint32_t>(
+extern template void knn_query<std::int64_t, float, std::uint32_t>(
   const raft::handle_t& handle,
   const BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index,
   std::uint32_t k,
@@ -43,7 +40,7 @@ extern template void rbc_knn_query<std::int64_t, float, std::uint32_t>(
   bool perform_post_filtering,
   float weight);
 
-extern template void rbc_all_knn_query<std::int64_t, float, std::uint32_t, std::uint32_t>(
+extern template void all_knn_query<std::int64_t, float, std::uint32_t, std::uint32_t>(
   const raft::handle_t& handle,
   BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index,
   std::uint32_t k,
@@ -51,6 +48,5 @@ extern template void rbc_all_knn_query<std::int64_t, float, std::uint32_t, std::
   float* dists,
   bool perform_post_filtering,
   float weight);
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
+
+};  // namespace raft::neighbors::ball_cover
\ No newline at end of file
diff --git a/cpp/include/raft/spatial/knn/specializations/detail/ball_cover_lowdim.hpp b/cpp/include/raft/neighbors/specializations/detail/ball_cover_lowdim.hpp
similarity index 100%
rename from cpp/include/raft/spatial/knn/specializations/detail/ball_cover_lowdim.hpp
rename to cpp/include/raft/neighbors/specializations/detail/ball_cover_lowdim.hpp
diff --git a/cpp/include/raft/spatial/knn/specializations/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/specializations/detail/ivf_pq_search.cuh
similarity index 100%
rename from cpp/include/raft/spatial/knn/specializations/detail/ivf_pq_search.cuh
rename to cpp/include/raft/neighbors/specializations/detail/ivf_pq_search.cuh
diff --git a/cpp/include/raft/spatial/knn/specializations/fused_l2_knn.cuh b/cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh
similarity index 100%
rename from cpp/include/raft/spatial/knn/specializations/fused_l2_knn.cuh
rename to cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh
diff --git a/cpp/include/raft/spatial/knn/specializations/knn.cuh b/cpp/include/raft/neighbors/specializations/knn.cuh
similarity index 100%
rename from cpp/include/raft/spatial/knn/specializations/knn.cuh
rename to cpp/include/raft/neighbors/specializations/knn.cuh
diff --git a/cpp/include/raft/solver/linear_assignment.cuh b/cpp/include/raft/solver/linear_assignment.cuh
index 4c24dcbc29..3e17b557f2 100644
--- a/cpp/include/raft/solver/linear_assignment.cuh
+++ b/cpp/include/raft/solver/linear_assignment.cuh
@@ -39,8 +39,19 @@
 
 namespace raft::solver {
 
+/**
+ * @brief CUDA Implementation of O(n^3) alternating tree Hungarian Algorithm
+ * @note This is a port to RAFT from original authors Ketan Date and Rakesh Nagi
+ *
+ * @see Date, Ketan, and Rakesh Nagi. "GPU-accelerated Hungarian algorithms
+ *          for the Linear Assignment Problem." Parallel Computing 57 (2016): 52-72.
+ *
+ * @tparam vertex_t
+ * @tparam weight_t
+ */
 template <typename vertex_t, typename weight_t>
 class LinearAssignmentProblem {
+ private:
   vertex_t size_;
   vertex_t batchsize_;
   weight_t epsilon_;
@@ -66,6 +77,13 @@ class LinearAssignmentProblem {
   rmm::device_uvector<weight_t> obj_val_dual_v;
 
  public:
+  /**
+   * @brief Constructor
+   * @param handle raft handle for managing resources
+   * @param size size of square matrix
+   * @param batchsize
+   * @param epsilon
+   */
   LinearAssignmentProblem(raft::handle_t const& handle,
                           vertex_t size,
                           vertex_t batchsize,
@@ -91,7 +109,12 @@ class LinearAssignmentProblem {
   {
   }
 
-  // Executes Hungarian algorithm on the input cost matrix.
+  /**
+   * Executes Hungarian algorithm on the input cost matrix.
+   * @param d_cost_matrix
+   * @param d_row_assignment
+   * @param d_col_assignment
+   */
   void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment)
   {
     initializeDevice();
@@ -118,19 +141,31 @@ class LinearAssignmentProblem {
     d_costs_ = nullptr;
   }
 
-  // Function for getting optimal row dual vector for subproblem spId.
+  /**
+   * Function for getting optimal row dual vector for subproblem spId.
+   * @param spId
+   * @return
+   */
   std::pair<const weight_t*, vertex_t> getRowDualVector(int spId) const
   {
     return std::make_pair(row_duals_v.data() + spId * size_, size_);
   }
 
-  // Function for getting optimal col dual vector for subproblem spId.
+  /**
+   * Function for getting optimal col dual vector for subproblem spId.
+   * @param spId
+   * @return
+   */
   std::pair<const weight_t*, vertex_t> getColDualVector(int spId)
   {
     return std::make_pair(col_duals_v.data() + spId * size_, size_);
   }
 
-  // Function for getting optimal primal objective value for subproblem spId.
+  /**
+   * Function for getting optimal primal objective value for subproblem spId.
+   * @param spId
+   * @return
+   */
   weight_t getPrimalObjectiveValue(int spId)
   {
     weight_t result;
@@ -139,7 +174,11 @@ class LinearAssignmentProblem {
     return result;
   }
 
-  // Function for getting optimal dual objective value for subproblem spId.
+  /**
+   * Function for getting optimal dual objective value for subproblem spId.
+   * @param spId
+   * @return
+   */
   weight_t getDualObjectiveValue(int spId)
   {
     weight_t result;
diff --git a/cpp/include/raft/sparse/spatial/knn.cuh b/cpp/include/raft/sparse/neighbors/brute_force.cuh
similarity index 73%
rename from cpp/include/raft/sparse/spatial/knn.cuh
rename to cpp/include/raft/sparse/neighbors/brute_force.cuh
index 1e8a08ec96..9639ddc24c 100644
--- a/cpp/include/raft/sparse/spatial/knn.cuh
+++ b/cpp/include/raft/sparse/neighbors/brute_force.cuh
@@ -17,9 +17,9 @@
 
 #include <raft/core/handle.hpp>
 #include <raft/distance/distance_types.hpp>
-#include <raft/sparse/spatial/detail/knn.cuh>
+#include <raft/sparse/neighbors/detail/knn.cuh>
 
-namespace raft::sparse::spatial {
+namespace raft::sparse::neighbors::brute_force {
 
 /**
  * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors
@@ -45,27 +45,27 @@ namespace raft::sparse::spatial {
  * @param[in] metric distance metric/measure to use
  * @param[in] metricArg potential argument for metric (currently unused)
  */
-template <typename value_idx = int, typename value_t = float, int TPB_X = 32>
-void brute_force_knn(const value_idx* idxIndptr,
-                     const value_idx* idxIndices,
-                     const value_t* idxData,
-                     size_t idxNNZ,
-                     int n_idx_rows,
-                     int n_idx_cols,
-                     const value_idx* queryIndptr,
-                     const value_idx* queryIndices,
-                     const value_t* queryData,
-                     size_t queryNNZ,
-                     int n_query_rows,
-                     int n_query_cols,
-                     value_idx* output_indices,
-                     value_t* output_dists,
-                     int k,
-                     const raft::handle_t& handle,
-                     size_t batch_size_index             = 2 << 14,  // approx 1M
-                     size_t batch_size_query             = 2 << 14,
-                     raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
-                     float metricArg                     = 0)
+template <typename value_idx = int, typename value_t = float>
+void knn(const value_idx* idxIndptr,
+         const value_idx* idxIndices,
+         const value_t* idxData,
+         size_t idxNNZ,
+         int n_idx_rows,
+         int n_idx_cols,
+         const value_idx* queryIndptr,
+         const value_idx* queryIndices,
+         const value_t* queryData,
+         size_t queryNNZ,
+         int n_query_rows,
+         int n_query_cols,
+         value_idx* output_indices,
+         value_t* output_dists,
+         int k,
+         const raft::handle_t& handle,
+         size_t batch_size_index             = 2 << 14,  // approx 1M
+         size_t batch_size_query             = 2 << 14,
+         raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
+         float metricArg                     = 0)
 {
   detail::sparse_knn_t<value_idx, value_t>(idxIndptr,
                                            idxIndices,
@@ -90,4 +90,4 @@ void brute_force_knn(const value_idx* idxIndptr,
     .run();
 }
 
-};  // namespace raft::sparse::spatial
+};  // namespace raft::sparse::neighbors::brute_force
diff --git a/cpp/include/raft/sparse/spatial/connect_components.cuh b/cpp/include/raft/sparse/neighbors/connect_components.cuh
similarity index 95%
rename from cpp/include/raft/sparse/spatial/connect_components.cuh
rename to cpp/include/raft/sparse/neighbors/connect_components.cuh
index 60c0bba1de..e468643518 100644
--- a/cpp/include/raft/sparse/spatial/connect_components.cuh
+++ b/cpp/include/raft/sparse/neighbors/connect_components.cuh
@@ -19,9 +19,9 @@
 #include <raft/core/handle.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/sparse/coo.hpp>
-#include <raft/sparse/spatial/detail/connect_components.cuh>
+#include <raft/sparse/neighbors/detail/connect_components.cuh>
 
-namespace raft::sparse::spatial {
+namespace raft::sparse::neighbors {
 
 template <typename value_idx, typename value_t>
 using FixConnectivitiesRedOp = detail::FixConnectivitiesRedOp<value_idx, value_t>;
@@ -76,4 +76,4 @@ void connect_components(
   detail::connect_components(handle, out, X, orig_colors, n_rows, n_cols, reduction_op, metric);
 }
 
-};  // end namespace raft::sparse::spatial
\ No newline at end of file
+};  // end namespace raft::sparse::neighbors
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/spatial/detail/connect_components.cuh b/cpp/include/raft/sparse/neighbors/detail/connect_components.cuh
similarity index 99%
rename from cpp/include/raft/sparse/spatial/detail/connect_components.cuh
rename to cpp/include/raft/sparse/neighbors/detail/connect_components.cuh
index 1c14669e28..38ba1137ac 100644
--- a/cpp/include/raft/sparse/spatial/detail/connect_components.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/connect_components.cuh
@@ -45,7 +45,7 @@
 
 #include <limits>
 
-namespace raft::sparse::spatial::detail {
+namespace raft::sparse::neighbors::detail {
 
 /**
  * Functor with reduction ops for performing fused 1-nn
@@ -401,4 +401,4 @@ void connect_components(
     handle, min_edges.rows(), min_edges.cols(), min_edges.vals(), n_rows, n_rows, size, out);
 }
 
-};  // end namespace raft::sparse::spatial::detail
+};  // end namespace raft::sparse::neighbors::detail
diff --git a/cpp/include/raft/sparse/spatial/detail/knn.cuh b/cpp/include/raft/sparse/neighbors/detail/knn.cuh
similarity index 99%
rename from cpp/include/raft/sparse/spatial/detail/knn.cuh
rename to cpp/include/raft/sparse/neighbors/detail/knn.cuh
index aa933cd680..38e67036fe 100644
--- a/cpp/include/raft/sparse/spatial/detail/knn.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/knn.cuh
@@ -33,7 +33,7 @@
 
 #include <algorithm>
 
-namespace raft::sparse::spatial::detail {
+namespace raft::sparse::neighbors::detail {
 
 template <typename value_idx, typename value_t>
 struct csr_batcher_t {
@@ -425,4 +425,4 @@ class sparse_knn_t {
   const raft::handle_t& handle;
 };
 
-};  // namespace raft::sparse::spatial::detail
\ No newline at end of file
+};  // namespace raft::sparse::neighbors::detail
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/spatial/detail/knn_graph.cuh b/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
similarity index 98%
rename from cpp/include/raft/sparse/spatial/detail/knn_graph.cuh
rename to cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
index 1331393719..ffd742f080 100644
--- a/cpp/include/raft/sparse/spatial/detail/knn_graph.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
@@ -35,7 +35,7 @@
 #include <algorithm>
 #include <limits>
 
-namespace raft::sparse::spatial::detail {
+namespace raft::sparse::neighbors::detail {
 
 /**
  * Fills indices array of pairwise distance array
@@ -147,4 +147,4 @@ void knn_graph(const handle_t& handle,
     handle, rows.data(), indices.data(), data.data(), m, k, nnz, out);
 }
 
-};  // namespace raft::sparse::spatial::detail
+};  // namespace raft::sparse::neighbors::detail
diff --git a/cpp/include/raft/sparse/neighbors/knn.cuh b/cpp/include/raft/sparse/neighbors/knn.cuh
new file mode 100644
index 0000000000..14404adcb4
--- /dev/null
+++ b/cpp/include/raft/sparse/neighbors/knn.cuh
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * This file is deprecated and will be removed in release 22.06.
+ * Please use raft/sparse/neighbors/brute_force.cuh instead.
+ */
+
+/**
+ * DISCLAIMER: this file is deprecated: use brute_force.cuh instead
+ */
+
+#pragma once
+
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the sparse/spatial version instead.")
+
+#include <raft/sparse/neighbors/brute_force.cuh>
+
+namespace raft::sparse::neighbors {
+
+/**
+ * Search a sparse (CSR) index for the k-nearest neighbors of a set of sparse query vectors.
+ * Forwards all arguments to brute_force::knn<value_idx, value_t>; TPB_X is not passed on.
+ * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1)
+ * @param[in] idxIndices csr column indices array of the index matrix (size idxNNZ)
+ * @param[in] idxData csr data array of the index matrix (size idxNNZ)
+ * @param[in] idxNNZ number of non-zeros for sparse index matrix
+ * @param[in] n_idx_rows number of data samples in index matrix
+ * @param[in] n_idx_cols number of columns (features) in index matrix
+ * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1)
+ * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ)
+ * @param[in] queryData csr data array of the query matrix (size queryNNZ)
+ * @param[in] queryNNZ number of non-zeros for sparse query matrix
+ * @param[in] n_query_rows number of data samples in query matrix
+ * @param[in] n_query_cols number of features in query matrix
+ * @param[out] output_indices dense matrix for output indices (size n_query_rows * k)
+ * @param[out] output_dists dense matrix for output distances (size n_query_rows * k)
+ * @param[in] k the number of neighbors to query
+ * @param[in] handle raft handle whose get_stream() is the CUDA stream used to order operations
+ * @param[in] batch_size_index maximum number of rows to use from index matrix per batch
+ * @param[in] batch_size_query maximum number of rows to use from query matrix per batch
+ * @param[in] metric distance metric/measure to use
+ * @param[in] metricArg potential argument for metric (currently unused)
+ */
+template <typename value_idx = int, typename value_t = float, int TPB_X = 32>
+void brute_force_knn(const value_idx* idxIndptr,
+                     const value_idx* idxIndices,
+                     const value_t* idxData,
+                     size_t idxNNZ,
+                     int n_idx_rows,
+                     int n_idx_cols,
+                     const value_idx* queryIndptr,
+                     const value_idx* queryIndices,
+                     const value_t* queryData,
+                     size_t queryNNZ,
+                     int n_query_rows,
+                     int n_query_cols,
+                     value_idx* output_indices,
+                     value_t* output_dists,
+                     int k,
+                     const raft::handle_t& handle,
+                     size_t batch_size_index             = 2 << 14,  // 2 << 14 == 32768 rows
+                     size_t batch_size_query             = 2 << 14,
+                     raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
+                     float metricArg                     = 0)
+{
+  brute_force::knn<value_idx, value_t>(idxIndptr,
+                                       idxIndices,
+                                       idxData,
+                                       idxNNZ,
+                                       n_idx_rows,
+                                       n_idx_cols,
+                                       queryIndptr,
+                                       queryIndices,
+                                       queryData,
+                                       queryNNZ,
+                                       n_query_rows,
+                                       n_query_cols,
+                                       output_indices,
+                                       output_dists,
+                                       k,
+                                       handle,
+                                       batch_size_index,
+                                       batch_size_query,
+                                       metric,
+                                       metricArg);
+}
+
+};  // namespace raft::sparse::neighbors
diff --git a/cpp/include/raft/sparse/spatial/knn_graph.cuh b/cpp/include/raft/sparse/neighbors/knn_graph.cuh
similarity index 92%
rename from cpp/include/raft/sparse/spatial/knn_graph.cuh
rename to cpp/include/raft/sparse/neighbors/knn_graph.cuh
index 9694e6a293..582df703db 100644
--- a/cpp/include/raft/sparse/spatial/knn_graph.cuh
+++ b/cpp/include/raft/sparse/neighbors/knn_graph.cuh
@@ -18,11 +18,11 @@
 
 #include <raft/distance/distance_types.hpp>
 #include <raft/sparse/coo.hpp>
-#include <raft/sparse/spatial/detail/knn_graph.cuh>
+#include <raft/sparse/neighbors/detail/knn_graph.cuh>
 
 #include <cstdint>
 
-namespace raft::sparse::spatial {
+namespace raft::sparse::neighbors {
 
 /**
  * Constructs a (symmetrized) knn graph edge list from
@@ -52,4 +52,4 @@ void knn_graph(const handle_t& handle,
   detail::knn_graph(handle, X, m, n, metric, out, c);
 }
 
-};  // namespace raft::sparse::spatial
+};  // namespace raft::sparse::neighbors
diff --git a/cpp/include/raft/sparse/neighbors/specializations.cuh b/cpp/include/raft/sparse/neighbors/specializations.cuh
new file mode 100644
index 0000000000..23ba38ccda
--- /dev/null
+++ b/cpp/include/raft/sparse/neighbors/specializations.cuh
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/distance/specializations.cuh>
+#include <raft/neighbors/specializations.cuh>
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh
index 22d8d7e936..c4479bc451 100644
--- a/cpp/include/raft/sparse/selection/connect_components.cuh
+++ b/cpp/include/raft/sparse/selection/connect_components.cuh
@@ -28,10 +28,10 @@
                 " is deprecated and will be removed in a future release." \
                 " Please use the sparse/spatial version instead.")
 
-#include <raft/sparse/spatial/connect_components.cuh>
+#include <raft/sparse/neighbors/connect_components.cuh>
 
 namespace raft::linkage {
-using raft::sparse::spatial::connect_components;
-using raft::sparse::spatial::FixConnectivitiesRedOp;
-using raft::sparse::spatial::get_n_components;
+using raft::sparse::neighbors::connect_components;
+using raft::sparse::neighbors::FixConnectivitiesRedOp;
+using raft::sparse::neighbors::get_n_components;
 }  // namespace raft::linkage
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh
index f6895addd1..c5b6a7ab2f 100644
--- a/cpp/include/raft/sparse/selection/knn.cuh
+++ b/cpp/include/raft/sparse/selection/knn.cuh
@@ -28,8 +28,8 @@
                 " is deprecated and will be removed in a future release." \
                 " Please use the sparse/spatial version instead.")
 
-#include <raft/sparse/spatial/knn.cuh>
+#include <raft/sparse/neighbors/knn.cuh>
 
 namespace raft::sparse::selection {
-using raft::sparse::spatial::brute_force_knn;
+using raft::sparse::neighbors::brute_force_knn;
 }
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh
index 54cc52f4ae..bd009bf297 100644
--- a/cpp/include/raft/sparse/selection/knn_graph.cuh
+++ b/cpp/include/raft/sparse/selection/knn_graph.cuh
@@ -28,8 +28,8 @@
                 " is deprecated and will be removed in a future release." \
                 " Please use the sparse/spatial version instead.")
 
-#include <raft/sparse/spatial/knn_graph.cuh>
+#include <raft/sparse/neighbors/knn_graph.cuh>
 
 namespace raft::sparse::selection {
-using raft::sparse::spatial::knn_graph;
+using raft::sparse::neighbors::knn_graph;
 }
diff --git a/cpp/include/raft/sparse/solver/mst.cuh b/cpp/include/raft/sparse/solver/mst.cuh
index 33beeb1915..5f55a567ca 100644
--- a/cpp/include/raft/sparse/solver/mst.cuh
+++ b/cpp/include/raft/sparse/solver/mst.cuh
@@ -20,6 +20,29 @@
 
 namespace raft::sparse::solver {
 
+/**
+ * Compute the minimum spanning tree (MST) or minimum spanning forest (MSF) depending on
+ * the connected components of the given graph.
+ *
+ * @tparam vertex_t integral type for precision of vertex indexing
+ * @tparam edge_t integral type for precision of edge indexing
+ * @tparam weight_t type of weights array
+ * @tparam alteration_t type to use for random alteration
+ *
+ * @param handle
+ * @param offsets csr indptr array of row offsets (size v+1)
+ * @param indices csr array of column indices (size e)
+ * @param weights csr array of weights (size e)
+ * @param v number of vertices in graph
+ * @param e number of edges in graph
+ * @param color array to store resulting colors for MSF
+ * @param stream cuda stream for ordering operations
+ * @param symmetrize_output should the resulting output edge list be symmetrized?
+ * @param initialize_colors should the colors array be initialized inside the MST?
+ * @param iterations maximum number of iterations to perform
+ * @return a list of edges containing the mst (or a subset of the edges guaranteed to be in the mst
+ * when an msf is encountered)
+ */
 template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t = weight_t>
 Graph_COO<vertex_t, edge_t, weight_t> mst(const raft::handle_t& handle,
                                           edge_t const* offsets,
diff --git a/cpp/include/raft/spatial/knn/ball_cover.cuh b/cpp/include/raft/spatial/knn/ball_cover.cuh
index 9cb9b573b1..fdc2d41161 100644
--- a/cpp/include/raft/spatial/knn/ball_cover.cuh
+++ b/cpp/include/raft/spatial/knn/ball_cover.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,77 +13,33 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef __BALL_COVER_H
-#define __BALL_COVER_H
+/**
+ * This file is deprecated and will be removed in release 22.06.
+ * Please use raft/neighbors/ball_cover.cuh instead.
+ */
+
+/**
+ * DISCLAIMER: this file is deprecated: use raft/neighbors/ball_cover.cuh instead
+ */
 
 #pragma once
 
-#include <cstdint>
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the raft::neighbors version instead.")
 
-#include "ball_cover_types.hpp"
-#include "detail/ball_cover.cuh"
-#include "detail/ball_cover/common.cuh"
-#include <raft/distance/distance_types.hpp>
-#include <thrust/transform.h>
+#include <raft/neighbors/ball_cover.cuh>
+#include <raft/spatial/knn/ball_cover_types.hpp>
 
-namespace raft {
-namespace spatial {
-namespace knn {
+namespace raft::spatial::knn {
 
-/**
- * Builds and populates a previously unbuilt BallCoverIndex
- * @tparam idx_t knn index type
- * @tparam value_t knn value type
- * @tparam int_t integral type for knn params
- * @tparam matrix_idx_t matrix indexing type
- * @param[in] handle library resource management handle
- * @param[inout] index an empty (and not previous built) instance of BallCoverIndex
- */
 template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
 void rbc_build_index(const raft::handle_t& handle,
                      BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index)
 {
-  ASSERT(index.n <= 3, "only 2d and 3d vectors are supported in current implementation");
-  if (index.metric == raft::distance::DistanceType::Haversine) {
-    detail::rbc_build_index(handle, index, detail::HaversineFunc<value_t, int_t>());
-  } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded ||
-             index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) {
-    detail::rbc_build_index(handle, index, detail::EuclideanFunc<value_t, int_t>());
-  } else {
-    RAFT_FAIL("Metric not support");
-  }
-
-  index.set_index_trained();
+  raft::neighbors::ball_cover::build_index(handle, index);
 }
 
-/**
- * Performs a faster exact knn in metric spaces using the triangle
- * inequality with a number of landmark points to reduce the
- * number of distance computations from O(n^2) to O(sqrt(n)). This
- * performs an all neighbors knn, which can reuse memory when
- * the index and query are the same array. This function will
- * build the index and assumes rbc_build_index() has not already
- * been called.
- * @tparam idx_t knn index type
- * @tparam value_t knn distance type
- * @tparam int_t type for integers, such as number of rows/cols
- * @param[in] handle raft handle for resource management
- * @param[inout] index ball cover index which has not yet been built
- * @param[in] k number of nearest neighbors to find
- * @param[in] perform_post_filtering if this is false, only the closest k landmarks
- *                               are considered (which will return approximate
- *                               results).
- * @param[out] inds output knn indices
- * @param[out] dists output knn distances
- * @param[in] weight a weight for overlap between the closest landmark and
- *               the radius of other landmarks when pruning distances.
- *               Setting this value below 1 can effectively turn off
- *               computing distances against many other balls, enabling
- *               approximate nearest neighbors. Recall can be adjusted
- *               based on how many relevant balls are ignored. Note that
- *               many datasets can still have great recall even by only
- *               looking in the closest landmark.
- */
 template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
 void rbc_all_knn_query(const raft::handle_t& handle,
                        BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
@@ -93,114 +49,10 @@ void rbc_all_knn_query(const raft::handle_t& handle,
                        bool perform_post_filtering = true,
                        float weight                = 1.0)
 {
-  ASSERT(index.n <= 3, "only 2d and 3d vectors are supported in current implementation");
-  if (index.metric == raft::distance::DistanceType::Haversine) {
-    detail::rbc_all_knn_query(handle,
-                              index,
-                              k,
-                              inds,
-                              dists,
-                              detail::HaversineFunc<value_t, int_t>(),
-                              perform_post_filtering,
-                              weight);
-  } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded ||
-             index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) {
-    detail::rbc_all_knn_query(handle,
-                              index,
-                              k,
-                              inds,
-                              dists,
-                              detail::EuclideanFunc<value_t, int_t>(),
-                              perform_post_filtering,
-                              weight);
-  } else {
-    RAFT_FAIL("Metric not supported");
-  }
-
-  index.set_index_trained();
-}
-
-/**
- * Performs a faster exact knn in metric spaces using the triangle
- * inequality with a number of landmark points to reduce the
- * number of distance computations from O(n^2) to O(sqrt(n)). This
- * performs an all neighbors knn, which can reuse memory when
- * the index and query are the same array. This function will
- * build the index and assumes rbc_build_index() has not already
- * been called.
- * @tparam idx_t knn index type
- * @tparam value_t knn distance type
- * @tparam int_t type for integers, such as number of rows/cols
- * @tparam matrix_idx_t matrix indexing type
- * @param[in] handle raft handle for resource management
- * @param[in] index ball cover index which has not yet been built
- * @param[out] inds output knn indices
- * @param[out] dists output knn distances
- * @param[in] k number of nearest neighbors to find
- * @param[in] perform_post_filtering if this is false, only the closest k landmarks
- *                               are considered (which will return approximate
- *                               results).
- * @param[in] weight a weight for overlap between the closest landmark and
- *               the radius of other landmarks when pruning distances.
- *               Setting this value below 1 can effectively turn off
- *               computing distances against many other balls, enabling
- *               approximate nearest neighbors. Recall can be adjusted
- *               based on how many relevant balls are ignored. Note that
- *               many datasets can still have great recall even by only
- *               looking in the closest landmark.
- */
-template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
-void rbc_all_knn_query(const raft::handle_t& handle,
-                       BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
-                       raft::device_matrix_view<idx_t, matrix_idx_t, row_major> inds,
-                       raft::device_matrix_view<value_t, matrix_idx_t, row_major> dists,
-                       int_t k,
-                       bool perform_post_filtering = true,
-                       float weight                = 1.0)
-{
-  RAFT_EXPECTS(index.n <= 3, "only 2d and 3d vectors are supported in current implementation");
-  RAFT_EXPECTS(k <= index.m,
-               "k must be less than or equal to the number of data points in the index");
-  RAFT_EXPECTS(inds.extent(1) == dists.extent(1) && dists.extent(1) == static_cast<matrix_idx_t>(k),
-               "Number of columns in output indices and distances matrices must be equal to k");
-
-  RAFT_EXPECTS(inds.extent(0) == dists.extent(0) && dists.extent(0) == index.get_X().extent(0),
-               "Number of rows in output indices and distances matrices must equal number of rows "
-               "in index matrix.");
-
-  rbc_all_knn_query(
-    handle, index, k, inds.data_handle(), dists.data_handle(), perform_post_filtering, weight);
+  raft::neighbors::ball_cover::all_knn_query(
+    handle, index, k, inds, dists, perform_post_filtering, weight);
 }
 
-/**
- * Performs a faster exact knn in metric spaces using the triangle
- * inequality with a number of landmark points to reduce the
- * number of distance computations from O(n^2) to O(sqrt(n)). This
- * function does not build the index and assumes rbc_build_index() has
- * already been called. Use this function when the index and
- * query arrays are different, otherwise use rbc_all_knn_query().
- * @tparam idx_t index type
- * @tparam value_t distances type
- * @tparam int_t integer type for size info
- * @param[in] handle raft handle for resource management
- * @param[inout] index ball cover index which has not yet been built
- * @param[in] k number of nearest neighbors to find
- * @param[in] query the
- * @param[in] perform_post_filtering if this is false, only the closest k landmarks
- *                               are considered (which will return approximate
- *                               results).
- * @param[out] inds output knn indices
- * @param[out] dists output knn distances
- * @param[in] weight a weight for overlap between the closest landmark and
- *               the radius of other landmarks when pruning distances.
- *               Setting this value below 1 can effectively turn off
- *               computing distances against many other balls, enabling
- *               approximate nearest neighbors. Recall can be adjusted
- *               based on how many relevant balls are ignored. Note that
- *               many datasets can still have great recall even by only
- *               looking in the closest landmark.
- * @param[in] n_query_pts number of query points
- */
 template <typename idx_t, typename value_t, typename int_t>
 void rbc_knn_query(const raft::handle_t& handle,
                    const BallCoverIndex<idx_t, value_t, int_t>& index,
@@ -212,103 +64,7 @@ void rbc_knn_query(const raft::handle_t& handle,
                    bool perform_post_filtering = true,
                    float weight                = 1.0)
 {
-  ASSERT(index.n <= 3, "only 2d and 3d vectors are supported in current implementation");
-  if (index.metric == raft::distance::DistanceType::Haversine) {
-    detail::rbc_knn_query(handle,
-                          index,
-                          k,
-                          query,
-                          n_query_pts,
-                          inds,
-                          dists,
-                          detail::HaversineFunc<value_t, int_t>(),
-                          perform_post_filtering,
-                          weight);
-  } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded ||
-             index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) {
-    detail::rbc_knn_query(handle,
-                          index,
-                          k,
-                          query,
-                          n_query_pts,
-                          inds,
-                          dists,
-                          detail::EuclideanFunc<value_t, int_t>(),
-                          perform_post_filtering,
-                          weight);
-  } else {
-    RAFT_FAIL("Metric not supported");
-  }
+  raft::neighbors::ball_cover::knn_query(
+    handle, index, k, query, n_query_pts, inds, dists, perform_post_filtering, weight);
 }
-
-/**
- * Performs a faster exact knn in metric spaces using the triangle
- * inequality with a number of landmark points to reduce the
- * number of distance computations from O(n^2) to O(sqrt(n)). This
- * function does not build the index and assumes rbc_build_index() has
- * already been called. Use this function when the index and
- * query arrays are different, otherwise use rbc_all_knn_query().
- * @tparam idx_t index type
- * @tparam value_t distances type
- * @tparam int_t integer type for size info
- * @tparam matrix_idx_t
- * @param[in] handle raft handle for resource management
- * @param[in] index ball cover index which has not yet been built
- * @param[in] query device matrix containing query data points
- * @param[out] inds output knn indices
- * @param[out] dists output knn distances
- * @param[in] k number of nearest neighbors to find
- * @param[in] perform_post_filtering if this is false, only the closest k landmarks
- *                               are considered (which will return approximate
- *                               results).
- * @param[in] weight a weight for overlap between the closest landmark and
- *               the radius of other landmarks when pruning distances.
- *               Setting this value below 1 can effectively turn off
- *               computing distances against many other balls, enabling
- *               approximate nearest neighbors. Recall can be adjusted
- *               based on how many relevant balls are ignored. Note that
- *               many datasets can still have great recall even by only
- *               looking in the closest landmark.
- */
-template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
-void rbc_knn_query(const raft::handle_t& handle,
-                   const BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
-                   raft::device_matrix_view<const value_t, matrix_idx_t, row_major> query,
-                   raft::device_matrix_view<idx_t, matrix_idx_t, row_major> inds,
-                   raft::device_matrix_view<value_t, matrix_idx_t, row_major> dists,
-                   int_t k,
-                   bool perform_post_filtering = true,
-                   float weight                = 1.0)
-{
-  RAFT_EXPECTS(k <= index.m,
-               "k must be less than or equal to the number of data points in the index");
-  RAFT_EXPECTS(inds.extent(1) == dists.extent(1) && dists.extent(1) == static_cast<idx_t>(k),
-               "Number of columns in output indices and distances matrices must be equal to k");
-
-  RAFT_EXPECTS(inds.extent(0) == dists.extent(0) && dists.extent(0) == query.extent(0),
-               "Number of rows in output indices and distances matrices must equal number of rows "
-               "in search matrix.");
-
-  RAFT_EXPECTS(query.extent(1) == index.get_X().extent(1),
-               "Number of columns in query and index matrices must match.");
-
-  rbc_knn_query(handle,
-                index,
-                k,
-                query.data_handle(),
-                query.extent(0),
-                inds.data_handle(),
-                dists.data_handle(),
-                perform_post_filtering,
-                weight);
-}
-
-// TODO: implement functions for:
-//  4. rbc_eps_neigh() - given a populated index, perform query against different query array
-//  5. rbc_all_eps_neigh() - populate a BallCoverIndex and query against training data
-
-}  // namespace knn
-}  // namespace spatial
-}  // namespace raft
-
-#endif
+}  // namespace raft::spatial::knn
diff --git a/cpp/include/raft/spatial/knn/ball_cover_types.hpp b/cpp/include/raft/spatial/knn/ball_cover_types.hpp
index 897bb4df5b..6ebdcd7877 100644
--- a/cpp/include/raft/spatial/knn/ball_cover_types.hpp
+++ b/cpp/include/raft/spatial/knn/ball_cover_types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,153 +13,25 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#pragma once
-
-#include <cstdint>
-#include <raft/core/device_mdarray.hpp>
-#include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
-#include <raft/distance/distance_types.hpp>
-#include <rmm/device_uvector.hpp>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-
 /**
- * Stores raw index data points, sampled landmarks, the 1-nns of index points
- * to their closest landmarks, and the ball radii of each landmark. This
- * class is intended to be constructed once and reused across subsequent
- * queries.
- * @tparam value_idx
- * @tparam value_t
- * @tparam value_int
+ * This file is deprecated and will be removed in release 22.06.
+ * Please use raft/neighbors/ball_cover_types.hpp instead.
  */
-template <typename value_idx,
-          typename value_t,
-          typename value_int  = std::uint32_t,
-          typename matrix_idx = std::uint32_t>
-class BallCoverIndex {
- public:
-  explicit BallCoverIndex(const raft::handle_t& handle_,
-                          const value_t* X_,
-                          value_int m_,
-                          value_int n_,
-                          raft::distance::DistanceType metric_)
-    : handle(handle_),
-      X(raft::make_device_matrix_view<const value_t, matrix_idx>(X_, m_, n_)),
-      m(m_),
-      n(n_),
-      metric(metric_),
-      /**
-       * the sqrt() here makes the sqrt(m)^2 a linear-time lower bound
-       *
-       * Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m)
-       */
-      n_landmarks(sqrt(m_)),
-      R_indptr(raft::make_device_vector<value_idx, matrix_idx>(handle, sqrt(m_) + 1)),
-      R_1nn_cols(raft::make_device_vector<value_idx, matrix_idx>(handle, m_)),
-      R_1nn_dists(raft::make_device_vector<value_t, matrix_idx>(handle, m_)),
-      R_closest_landmark_dists(raft::make_device_vector<value_t, matrix_idx>(handle, m_)),
-      R(raft::make_device_matrix<value_t, matrix_idx>(handle, sqrt(m_), n_)),
-      R_radius(raft::make_device_vector<value_t, matrix_idx>(handle, sqrt(m_))),
-      index_trained(false)
-  {
-  }
-
-  explicit BallCoverIndex(const raft::handle_t& handle_,
-                          raft::device_matrix_view<const value_t, matrix_idx, row_major> X_,
-                          raft::distance::DistanceType metric_)
-    : handle(handle_),
-      X(X_),
-      m(X_.extent(0)),
-      n(X_.extent(1)),
-      metric(metric_),
-      /**
-       * the sqrt() here makes the sqrt(m)^2 a linear-time lower bound
-       *
-       * Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m)
-       */
-      n_landmarks(sqrt(X_.extent(0))),
-      R_indptr(raft::make_device_vector<value_idx, matrix_idx>(handle, sqrt(X_.extent(0)) + 1)),
-      R_1nn_cols(raft::make_device_vector<value_idx, matrix_idx>(handle, X_.extent(0))),
-      R_1nn_dists(raft::make_device_vector<value_t, matrix_idx>(handle, X_.extent(0))),
-      R_closest_landmark_dists(raft::make_device_vector<value_t, matrix_idx>(handle, X_.extent(0))),
-      R(raft::make_device_matrix<value_t, matrix_idx>(handle, sqrt(X_.extent(0)), X_.extent(1))),
-      R_radius(raft::make_device_vector<value_t, matrix_idx>(handle, sqrt(X_.extent(0)))),
-      index_trained(false)
-  {
-  }
-
-  auto get_R_indptr() const -> raft::device_vector_view<const value_idx, matrix_idx>
-  {
-    return R_indptr.view();
-  }
-  auto get_R_1nn_cols() const -> raft::device_vector_view<const value_idx, matrix_idx>
-  {
-    return R_1nn_cols.view();
-  }
-  auto get_R_1nn_dists() const -> raft::device_vector_view<const value_t, matrix_idx>
-  {
-    return R_1nn_dists.view();
-  }
-  auto get_R_radius() const -> raft::device_vector_view<const value_t, matrix_idx>
-  {
-    return R_radius.view();
-  }
-  auto get_R() const -> raft::device_matrix_view<const value_t, matrix_idx, row_major>
-  {
-    return R.view();
-  }
-  auto get_R_closest_landmark_dists() const -> raft::device_vector_view<const value_t, matrix_idx>
-  {
-    return R_closest_landmark_dists.view();
-  }
-
-  raft::device_vector_view<value_idx, matrix_idx> get_R_indptr() { return R_indptr.view(); }
-  raft::device_vector_view<value_idx, matrix_idx> get_R_1nn_cols() { return R_1nn_cols.view(); }
-  raft::device_vector_view<value_t, matrix_idx> get_R_1nn_dists() { return R_1nn_dists.view(); }
-  raft::device_vector_view<value_t, matrix_idx> get_R_radius() { return R_radius.view(); }
-  raft::device_matrix_view<value_t, matrix_idx, row_major> get_R() { return R.view(); }
-  raft::device_vector_view<value_t, matrix_idx> get_R_closest_landmark_dists()
-  {
-    return R_closest_landmark_dists.view();
-  }
-  raft::device_matrix_view<const value_t, matrix_idx, row_major> get_X() const { return X; }
 
-  raft::distance::DistanceType get_metric() const { return metric; }
-
-  value_int get_n_landmarks() const { return n_landmarks; }
-  bool is_index_trained() const { return index_trained; };
-
-  // This should only be set by internal functions
-  void set_index_trained() { index_trained = true; }
-
-  const raft::handle_t& handle;
-
-  value_int m;
-  value_int n;
-  value_int n_landmarks;
+/**
+ * DISCLAIMER: this file is deprecated: use raft/neighbors/ball_cover_types.hpp instead
+ */
 
-  raft::device_matrix_view<const value_t, matrix_idx, row_major> X;
+#pragma once
 
-  raft::distance::DistanceType metric;
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the raft::neighbors version instead.")
 
- private:
-  // CSR storing the neighborhoods for each data point
-  raft::device_vector<value_idx, matrix_idx> R_indptr;
-  raft::device_vector<value_idx, matrix_idx> R_1nn_cols;
-  raft::device_vector<value_t, matrix_idx> R_1nn_dists;
-  raft::device_vector<value_t, matrix_idx> R_closest_landmark_dists;
+#include <raft/neighbors/ball_cover_types.hpp>
 
-  raft::device_vector<value_t, matrix_idx> R_radius;
+namespace raft::spatial::knn {
 
-  raft::device_matrix<value_t, matrix_idx, row_major> R;
+using raft::neighbors::ball_cover::BallCoverIndex;
 
- protected:
-  bool index_trained;
-};
-}  // namespace knn
-}  // namespace spatial
-}  // namespace raft
+}  // namespace raft::spatial::knn
diff --git a/cpp/include/raft/spatial/knn/brute_force.cuh b/cpp/include/raft/spatial/knn/brute_force.cuh
deleted file mode 100644
index dda1e02eed..0000000000
--- a/cpp/include/raft/spatial/knn/brute_force.cuh
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "detail/knn_brute_force_faiss.cuh"
-#include "detail/selection_faiss.cuh"
-#include <raft/core/device_mdspan.hpp>
-
-namespace raft::spatial::knn {
-
-/**
- * @brief Performs a k-select across row partitioned index/distance
- * matrices formatted like the following:
- * row1: k0, k1, k2
- * row2: k0, k1, k2
- * row3: k0, k1, k2
- * row1: k0, k1, k2
- * row2: k0, k1, k2
- * row3: k0, k1, k2
- *
- * etc...
- *
- * @tparam idx_t
- * @tparam value_t
- * @param[in] handle
- * @param[in] in_keys matrix of input keys (size n_samples * n_parts * k)
- * @param[in] in_values matrix of input values (size n_samples * n_parts * k)
- * @param[out] out_keys matrix of output keys (size n_samples * k)
- * @param[out] out_values matrix of output values (size n_samples * k)
- * @param[in] n_samples number of rows in each part
- * @param[in] translations optional vector of starting index mappings for each partition
- */
-template <typename idx_t, typename value_t>
-inline void knn_merge_parts(
-  const raft::handle_t& handle,
-  raft::device_matrix_view<const value_t, idx_t, row_major> in_keys,
-  raft::device_matrix_view<const idx_t, idx_t, row_major> in_values,
-  raft::device_matrix_view<value_t, idx_t, row_major> out_keys,
-  raft::device_matrix_view<idx_t, idx_t, row_major> out_values,
-  size_t n_samples,
-  std::optional<raft::device_vector_view<idx_t, idx_t>> translations = std::nullopt)
-{
-  RAFT_EXPECTS(in_keys.extent(1) == in_values.extent(1) && in_keys.extent(0) == in_values.extent(0),
-               "in_keys and in_values must have the same shape.");
-  RAFT_EXPECTS(
-    out_keys.extent(0) == out_values.extent(0) == n_samples,
-    "Number of rows in output keys and val matrices must equal number of rows in search matrix.");
-  RAFT_EXPECTS(out_keys.extent(1) == out_values.extent(1) == in_keys.extent(1),
-               "Number of columns in output indices and distances matrices must be equal to k");
-
-  auto n_parts = in_keys.extent(0) / n_samples;
-  detail::knn_merge_parts(in_keys.data_handle(),
-                          in_values.data_handle(),
-                          out_keys.data_handle(),
-                          out_values.data_handle(),
-                          n_samples,
-                          n_parts,
-                          in_keys.extent(1),
-                          handle.get_stream(),
-                          translations.value_or(nullptr));
-}
-
-/**
- * @brief Flat C++ API function to perform a brute force knn on
- * a series of input arrays and combine the results into a single
- * output array for indexes and distances. Inputs can be either
- * row- or column-major but the output matrices will always be in
- * row-major format.
- *
- * @param[in] handle the cuml handle to use
- * @param[in] index vector of device matrices (each size m_i*d) to be used as the knn index
- * @param[in] search matrix (size n*d) to be used for searching the index
- * @param[out] indices matrix (size n*k) to store output knn indices
- * @param[out] distances matrix (size n*k) to store the output knn distance
- * @param[in] k the number of nearest neighbors to return
- * @param[in] metric distance metric to use. Euclidean (L2) is used by default
- * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. This
- * 					 is ignored if the metric_type is not Minkowski.
- * @param[in] translations starting offsets for partitions. should be the same size
- *            as input vector.
- */
-template <typename idx_t,
-          typename value_t,
-          typename value_int,
-          typename matrix_idx,
-          typename index_layout,
-          typename search_layout>
-void brute_force_knn(
-  raft::handle_t const& handle,
-  std::vector<raft::device_matrix_view<const value_t, matrix_idx, index_layout>> index,
-  raft::device_matrix_view<const value_t, matrix_idx, search_layout> search,
-  raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,
-  raft::device_matrix_view<value_t, matrix_idx, row_major> distances,
-  value_int k,
-  distance::DistanceType metric                  = distance::DistanceType::L2Unexpanded,
-  std::optional<float> metric_arg                = std::make_optional<float>(2.0f),
-  std::optional<std::vector<idx_t>> translations = std::nullopt)
-{
-  RAFT_EXPECTS(index[0].extent(1) == search.extent(1),
-               "Number of dimensions for both index and search matrices must be equal");
-
-  RAFT_EXPECTS(indices.extent(0) == distances.extent(0) && distances.extent(0) == search.extent(0),
-               "Number of rows in output indices and distances matrices must equal number of rows "
-               "in search matrix.");
-  RAFT_EXPECTS(
-    indices.extent(1) == distances.extent(1) && distances.extent(1) == static_cast<matrix_idx>(k),
-    "Number of columns in output indices and distances matrices must be equal to k");
-
-  bool rowMajorIndex = std::is_same_v<index_layout, layout_c_contiguous>;
-  bool rowMajorQuery = std::is_same_v<search_layout, layout_c_contiguous>;
-
-  std::vector<value_t*> inputs;
-  std::vector<value_int> sizes;
-  for (std::size_t i = 0; i < index.size(); ++i) {
-    inputs.push_back(const_cast<value_t*>(index[i].data_handle()));
-    sizes.push_back(index[i].extent(0));
-  }
-
-  std::vector<idx_t>* trans = translations.has_value() ? &(*translations) : nullptr;
-
-  detail::brute_force_knn_impl(handle,
-                               inputs,
-                               sizes,
-                               static_cast<value_int>(index[0].extent(1)),
-                               // TODO: This is unfortunate. Need to fix.
-                               const_cast<value_t*>(search.data_handle()),
-                               static_cast<value_int>(search.extent(0)),
-                               indices.data_handle(),
-                               distances.data_handle(),
-                               k,
-                               rowMajorIndex,
-                               rowMajorQuery,
-                               trans,
-                               metric,
-                               metric_arg.value_or(2.0f));
-}
-
-}  // namespace raft::spatial::knn
diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
index bf0df065b2..ff4708bb7b 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -21,13 +21,16 @@
 #include <thrust/gather.h>
 #include <thrust/transform.h>
 
+#include <raft/cluster/detail/kmeans_common.cuh>
 #include <raft/common/nvtx.hpp>
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <raft/distance/fused_l2_nn.cuh>
+#include <raft/linalg/add.cuh>
 #include <raft/linalg/gemm.cuh>
+#include <raft/linalg/matrix_vector_op.cuh>
 #include <raft/linalg/norm.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/matrix/matrix.cuh>
@@ -207,14 +210,7 @@ constexpr inline auto calc_minibatch_size(uint32_t n_clusters,
  * multiple times with different datasets with the same effect as if calling this function once
  * on the combined dataset_.
  *
- * NB: `centers` and `cluster_sizes` must be accessible on GPU due to
- * divide_along_rows/normalize_rows. The rest can be both, under assumption that all pointers are
- * accessible from the same place.
- *
- * i.e. two variants are possible:
- *
- *   1. All pointers are on the device.
- *   2. All pointers are on the host, but `centers` and `cluster_sizes` are accessible from GPU.
+ * NB: all pointers must be accessible on the device.
  *
  * @tparam T      element type
  * @tparam IdxT   index type
@@ -231,9 +227,11 @@ constexpr inline auto calc_minibatch_size(uint32_t n_clusters,
  *    When set to `false`, this function may be used to update existing centers and sizes using
  *    the weighted average principle.
  * @param stream
+ * @param mr (optional) memory resource to use for temporary allocations on the device
  */
 template <typename T, typename IdxT, typename LabelT>
-void calc_centers_and_sizes(float* centers,
+void calc_centers_and_sizes(const handle_t& handle,
+                            float* centers,
                             uint32_t* cluster_sizes,
                             uint32_t n_clusters,
                             uint32_t dim,
@@ -241,12 +239,12 @@ void calc_centers_and_sizes(float* centers,
                             IdxT n_rows,
                             const LabelT* labels,
                             bool reset_counters,
-                            rmm::cuda_stream_view stream)
+                            rmm::cuda_stream_view stream,
+                            rmm::mr::device_memory_resource* mr = nullptr)
 {
-  if (reset_counters) {
-    utils::memzero(centers, n_clusters * dim, stream);
-    utils::memzero(cluster_sizes, n_clusters, stream);
-  } else {
+  if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
+
+  if (!reset_counters) {
     utils::map_along_rows(
       n_clusters,
       dim,
@@ -255,13 +253,70 @@ void calc_centers_and_sizes(float* centers,
       [] __device__(float c, uint32_t s) -> float { return c * s; },
       stream);
   }
-  utils::accumulate_into_selected(n_rows, dim, centers, cluster_sizes, dataset, labels, stream);
-  utils::map_along_rows(
-    n_clusters,
-    dim,
+
+  rmm::device_uvector<char> workspace(0, stream, mr);
+  rmm::device_uvector<float> cluster_sizes_f(n_clusters, stream, mr);
+  float* sizes_f = cluster_sizes_f.data();
+
+  // If we reset the counters, we can compute directly the new sizes in cluster_sizes.
+  // If we don't reset, we compute in a temporary buffer and add in a separate step.
+  rmm::device_uvector<uint32_t> temp_cluster_sizes(0, stream, mr);
+  uint32_t* temp_sizes = cluster_sizes;
+  if (!reset_counters) {
+    temp_cluster_sizes.resize(n_clusters, stream);
+    temp_sizes = temp_cluster_sizes.data();
+  }
+
+  utils::mapping<float> mapping_op;
+  cub::TransformInputIterator<float, utils::mapping<float>, const T*> mapping_itr(dataset,
+                                                                                  mapping_op);
+
+  // todo(lsugy): use iterator from KV output of fusedL2NN
+  raft::linalg::reduce_rows_by_key(mapping_itr,
+                                   static_cast<int64_t>(dim),
+                                   labels,
+                                   nullptr,
+                                   static_cast<int64_t>(n_rows),
+                                   static_cast<int64_t>(dim),
+                                   static_cast<int64_t>(n_clusters),
+                                   centers,
+                                   stream,
+                                   reset_counters);
+
+  // Compute weight of each cluster
+  raft::cluster::detail::countLabels(handle,
+                                     labels,
+                                     temp_sizes,
+                                     static_cast<int64_t>(n_rows),
+                                     static_cast<int64_t>(n_clusters),
+                                     workspace);
+
+  // Add previous sizes if necessary and cast to float
+  auto counting = thrust::make_counting_iterator<int>(0);
+  thrust::for_each(
+    handle.get_thrust_policy(), counting, counting + n_clusters, [=] __device__(int idx) {
+      uint32_t temp_size = temp_sizes[idx];
+      if (!reset_counters) {
+        temp_size += cluster_sizes[idx];
+        cluster_sizes[idx] = temp_size;
+      }
+      sizes_f[idx] = static_cast<float>(temp_size);
+    });
+
+  raft::linalg::matrixVectorOp(
+    centers,
     centers,
-    cluster_sizes,
-    [] __device__(float c, uint32_t s) -> float { return s == 0 ? 0.0f : c / float(s); },
+    sizes_f,
+    static_cast<int64_t>(dim),
+    static_cast<int64_t>(n_clusters),
+    true,
+    false,
+    [=] __device__(float mat, float vec) {
+      if (vec == 0.0f)
+        return 0.0f;
+      else
+        return mat / vec;
+    },
     stream);
 }
 
@@ -627,7 +682,8 @@ void balancing_em_iters(const handle_t& handle,
                              device_memory,
                              dataset_norm);
     // M: Maximization step - calculate optimal cluster centers
-    calc_centers_and_sizes(cluster_centers,
+    calc_centers_and_sizes(handle,
+                           cluster_centers,
                            cluster_sizes,
                            n_clusters,
                            dim,
@@ -635,7 +691,8 @@ void balancing_em_iters(const handle_t& handle,
                            n_rows,
                            cluster_labels,
                            true,
-                           stream);
+                           stream,
+                           device_memory);
   }
 }
 
@@ -666,8 +723,17 @@ void build_clusters(const handle_t& handle,
   linalg::writeOnlyUnaryOp<LabelT, decltype(f), IdxT>(cluster_labels, n_rows, f, stream);
 
   // update centers to match the initialized labels.
-  calc_centers_and_sizes(
-    cluster_centers, cluster_sizes, n_clusters, dim, dataset, n_rows, cluster_labels, true, stream);
+  calc_centers_and_sizes(handle,
+                         cluster_centers,
+                         cluster_sizes,
+                         n_clusters,
+                         dim,
+                         dataset,
+                         n_rows,
+                         cluster_labels,
+                         true,
+                         stream,
+                         device_memory);
 
   // run EM
   balancing_em_iters<T, IdxT, LabelT>(handle,
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 8dda574314..dbd509216b 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -112,13 +112,13 @@ struct mapping {
    * @{
    */
   template <typename S>
-  HDI auto operator()(const S& x) -> std::enable_if_t<std::is_same_v<S, T>, T>
+  HDI auto operator()(const S& x) const -> std::enable_if_t<std::is_same_v<S, T>, T>
   {
     return x;
   };
 
   template <typename S>
-  HDI auto operator()(const S& x) -> std::enable_if_t<!std::is_same_v<S, T>, T>
+  HDI auto operator()(const S& x) const -> std::enable_if_t<!std::is_same_v<S, T>, T>
   {
     constexpr double kMult = config<T>::kDivisor / config<S>::kDivisor;
     if constexpr (std::is_floating_point_v<S>) { return static_cast<T>(x * static_cast<S>(kMult)); }
@@ -259,72 +259,6 @@ inline void dots_along_rows(
    */
 }
 
-template <typename T, typename IdxT, typename LabelT>
-__global__ void accumulate_into_selected_kernel(IdxT n_rows,
-                                                uint32_t n_cols,
-                                                float* output,
-                                                uint32_t* selection_counters,
-                                                const T* input,
-                                                const LabelT* row_ids)
-{
-  IdxT gid = threadIdx.x + (blockDim.x * static_cast<IdxT>(blockIdx.x));
-  IdxT j   = gid % n_cols;
-  IdxT i   = gid / n_cols;
-  if (i >= n_rows) return;
-  IdxT l = static_cast<IdxT>(row_ids[i]);
-  if (j == 0) { atomicAdd(&(selection_counters[l]), 1); }
-  atomicAdd(&(output[j + n_cols * l]), mapping<float>{}(input[gid]));
-}
-
-/**
- * @brief Add all rows of input matrix into a selection of rows in the output matrix
- * (cast and possibly scale the data input type). Count the number of times every output
- * row was selected along the way.
- *
- * @tparam T      element type
- * @tparam IdxT   index type
- * @tparam LabelT label type
- *
- * @param n_cols number of columns in all matrices
- * @param[out] output output matrix [..., n_cols]
- * @param[inout] selection_counters number of occurrences of each row id in row_ids [..., n_cols]
- * @param n_rows number of rows in the input
- * @param[in] input row-major input matrix [n_rows, n_cols]
- * @param[in] row_ids row indices in the output matrix [n_rows]
- */
-template <typename T, typename IdxT, typename LabelT>
-void accumulate_into_selected(IdxT n_rows,
-                              uint32_t n_cols,
-                              float* output,
-                              uint32_t* selection_counters,
-                              const T* input,
-                              const LabelT* row_ids,
-                              rmm::cuda_stream_view stream)
-{
-  switch (check_pointer_residency(output, input, selection_counters, row_ids)) {
-    case pointer_residency::host_and_device:
-    case pointer_residency::device_only: {
-      uint32_t block_dim = 128;
-      auto grid_dim =
-        static_cast<uint32_t>(ceildiv<IdxT>(n_rows * static_cast<IdxT>(n_cols), block_dim));
-      accumulate_into_selected_kernel<T><<<grid_dim, block_dim, 0, stream>>>(
-        n_rows, n_cols, output, selection_counters, input, row_ids);
-    } break;
-    case pointer_residency::host_only: {
-      stream.synchronize();
-      for (IdxT i = 0; i < n_rows; i++) {
-        IdxT l = static_cast<IdxT>(row_ids[i]);
-        selection_counters[l]++;
-        for (IdxT j = 0; j < n_cols; j++) {
-          output[j + n_cols * l] += mapping<float>{}(input[j + n_cols * i]);
-        }
-      }
-      stream.synchronize();
-    } break;
-    default: RAFT_FAIL("All pointers must reside on the same side, host or device.");
-  }
-}
-
 template <typename IdxT>
 __global__ void normalize_rows_kernel(IdxT n_rows, IdxT n_cols, float* a)
 {
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
index af1cb97d36..14f5ae4516 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh
@@ -144,7 +144,8 @@ inline auto extend(const handle_t& handle,
   raft::copy(
     list_sizes_ptr, orig_index.list_sizes().data_handle(), ext_index.list_sizes().size(), stream);
 
-  kmeans::calc_centers_and_sizes(centers_ptr,
+  kmeans::calc_centers_and_sizes(handle,
+                                 centers_ptr,
                                  list_sizes_ptr,
                                  n_lists,
                                  dim,
diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh
index f13dcd8cc6..0577d24349 100644
--- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh
@@ -41,6 +41,7 @@
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
 #include <thrust/binary_search.h>
+#include <thrust/extrema.h>
 #include <thrust/functional.h>
 #include <thrust/scan.h>
 #include <thrust/sequence.h>
@@ -430,26 +431,18 @@ auto calculate_offsets_and_indices(IdxT n_rows,
                                    IdxT* data_indices,
                                    rmm::cuda_stream_view stream) -> uint32_t
 {
-  auto exec_policy          = rmm::exec_policy(stream);
-  uint32_t max_cluster_size = 0;
-  rmm::device_scalar<uint32_t> max_cluster_size_dev_buf(stream);
-  auto max_cluster_size_dev = max_cluster_size_dev_buf.data();
-  update_device(max_cluster_size_dev, &max_cluster_size, 1, stream);
+  auto exec_policy = rmm::exec_policy(stream);
   // Calculate the offsets
   IdxT cumsum = 0;
   update_device(cluster_offsets, &cumsum, 1, stream);
-  thrust::inclusive_scan(exec_policy,
-                         cluster_sizes,
-                         cluster_sizes + n_lists,
-                         cluster_offsets + 1,
-                         [max_cluster_size_dev] __device__(IdxT s, uint32_t l) {
-                           atomicMax(max_cluster_size_dev, l);
-                           return s + l;
-                         });
+  thrust::inclusive_scan(
+    exec_policy, cluster_sizes, cluster_sizes + n_lists, cluster_offsets + 1, thrust::plus<IdxT>{});
   update_host(&cumsum, cluster_offsets + n_lists, 1, stream);
-  update_host(&max_cluster_size, max_cluster_size_dev, 1, stream);
+  uint32_t max_cluster_size =
+    *thrust::max_element(exec_policy, cluster_sizes, cluster_sizes + n_lists);
   stream.synchronize();
   RAFT_EXPECTS(cumsum == n_rows, "cluster sizes do not add up.");
+  RAFT_LOG_DEBUG("Max cluster size %d", max_cluster_size);
   rmm::device_uvector<IdxT> data_offsets_buf(n_lists, stream);
   auto data_offsets = data_offsets_buf.data();
   copy(data_offsets, cluster_offsets, n_lists, stream);
diff --git a/cpp/include/raft/spatial/knn/epsilon_neighborhood.cuh b/cpp/include/raft/spatial/knn/epsilon_neighborhood.cuh
index 53fe76fada..e0a63ee42a 100644
--- a/cpp/include/raft/spatial/knn/epsilon_neighborhood.cuh
+++ b/cpp/include/raft/spatial/knn/epsilon_neighborhood.cuh
@@ -13,90 +13,26 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+/**
+ * This file is deprecated and will be removed in a future release.
+ * Please use raft/neighbors/epsilon_neighborhood.cuh instead.
+ */
 
-#ifndef __EPSILON_NEIGH_H
-#define __EPSILON_NEIGH_H
+/**
+ * DISCLAIMER: this file is deprecated: use raft/neighbors/epsilon_neighborhood.cuh instead
+ */
 
 #pragma once
 
-#include <raft/core/device_mdspan.hpp>
-#include <raft/spatial/knn/detail/epsilon_neighborhood.cuh>
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the raft::neighbors version instead.")
 
-namespace raft {
-namespace spatial {
-namespace knn {
+#include <raft/neighbors/epsilon_neighborhood.cuh>
 
-/**
- * @brief Computes epsilon neighborhood for the L2-Squared distance metric
- *
- * @tparam value_t   IO and math type
- * @tparam idx_t    Index type
- *
- * @param[out] adj    adjacency matrix [row-major] [on device] [dim = m x n]
- * @param[out] vd     vertex degree array [on device] [len = m + 1]
- *                    `vd + m` stores the total number of edges in the adjacency
- *                    matrix. Pass a nullptr if you don't need this info.
- * @param[in]  x      first matrix [row-major] [on device] [dim = m x k]
- * @param[in]  y      second matrix [row-major] [on device] [dim = n x k]
- * @param[in]  m      number of rows in x
- * @param[in]  n      number of rows in y
- * @param[in]  k      number of columns in x and k
- * @param[in]  eps    defines epsilon neighborhood radius (should be passed as
- *                    squared as we compute L2-squared distance in this method)
- * @param[in]  stream cuda stream
- */
-template <typename value_t, typename idx_t>
-void epsUnexpL2SqNeighborhood(bool* adj,
-                              idx_t* vd,
-                              const value_t* x,
-                              const value_t* y,
-                              idx_t m,
-                              idx_t n,
-                              idx_t k,
-                              value_t eps,
-                              cudaStream_t stream)
-{
-  detail::epsUnexpL2SqNeighborhood<value_t, idx_t>(adj, vd, x, y, m, n, k, eps, stream);
-}
-
-/**
- * @brief Computes epsilon neighborhood for the L2-Squared distance metric
- *
- * @tparam value_t   IO and math type
- * @tparam idx_t    Index type
- * @tparam matrix_idx_t matrix indexing type
- *
- * @param[in]  handle raft handle to manage library resources
- * @param[in]  x      first matrix [row-major] [on device] [dim = m x k]
- * @param[in]  y      second matrix [row-major] [on device] [dim = n x k]
- * @param[out] adj    adjacency matrix [row-major] [on device] [dim = m x n]
- * @param[out] vd     vertex degree array [on device] [len = m + 1]
- *                    `vd + m` stores the total number of edges in the adjacency
- *                    matrix. Pass a nullptr if you don't need this info.
- * @param[in]  eps    defines epsilon neighborhood radius (should be passed as
- *                    squared as we compute L2-squared distance in this method)
- */
-template <typename value_t, typename idx_t, typename matrix_idx_t>
-void eps_neighbors_l2sq(const raft::handle_t& handle,
-                        raft::device_matrix_view<const value_t, matrix_idx_t, row_major> x,
-                        raft::device_matrix_view<const value_t, matrix_idx_t, row_major> y,
-                        raft::device_matrix_view<bool, matrix_idx_t, row_major> adj,
-                        raft::device_vector_view<idx_t, matrix_idx_t> vd,
-                        value_t eps)
-{
-  epsUnexpL2SqNeighborhood<value_t, idx_t>(adj.data_handle(),
-                                           vd.data_handle(),
-                                           x.data_handle(),
-                                           y.data_handle(),
-                                           x.extent(0),
-                                           y.extent(0),
-                                           x.extent(1),
-                                           eps,
-                                           handle.get_stream());
-}
+namespace raft::spatial::knn {
 
-}  // namespace knn
-}  // namespace spatial
-}  // namespace raft
+using raft::neighbors::epsilon_neighborhood::eps_neighbors_l2sq;
+using raft::neighbors::epsilon_neighborhood::epsUnexpL2SqNeighborhood;
 
-#endif
\ No newline at end of file
+}  // namespace raft::spatial::knn
diff --git a/cpp/include/raft/spatial/knn/ivf_flat.cuh b/cpp/include/raft/spatial/knn/ivf_flat.cuh
index 58ca96d392..92fe49be98 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/ivf_flat.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,375 +13,27 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#pragma once
-
-#include "detail/ivf_flat_build.cuh"
-#include "detail/ivf_flat_search.cuh"
-#include "ivf_flat_types.hpp"
-
-#include <raft/core/handle.hpp>
-
-#include <raft/core/device_mdspan.hpp>
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-
-namespace raft::spatial::knn::ivf_flat {
-
-/**
- * @brief Build the index from the dataset for efficient search.
- *
- * NB: Currently, the following distance metrics are supported:
- * - L2Expanded
- * - L2Unexpanded
- * - InnerProduct
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::spatial::knn;
- *   // use default index parameters
- *   ivf_flat::index_params index_params;
- *   // create and fill the index from a [N, D] dataset
- *   auto index = ivf_flat::build(handle, index_params, dataset, N, D);
- *   // use default search parameters
- *   ivf_flat::search_params search_params;
- *   // search K nearest neighbours for each of the N queries
- *   ivf_flat::search(handle, search_params, index, queries, N, K, out_inds, out_dists);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] handle
- * @param[in] params configure the index building
- * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
- * @param[in] n_rows the number of samples
- * @param[in] dim the dimensionality of the data
- *
- * @return the constructed ivf-flat index
- */
-template <typename T, typename IdxT>
-inline auto build(
-  const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim)
-  -> index<T, IdxT>
-{
-  return raft::spatial::knn::ivf_flat::detail::build(handle, params, dataset, n_rows, dim);
-}
-
-/**
- * @brief Build the index from the dataset for efficient search.
- *
- * NB: Currently, the following distance metrics are supported:
- * - L2Expanded
- * - L2Unexpanded
- * - InnerProduct
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::spatial::knn;
- *   // use default index parameters
- *   ivf_flat::index_params index_params;
- *   // create and fill the index from a [N, D] dataset
- *   auto index = ivf_flat::build(handle, index_params, dataset, N, D);
- *   // use default search parameters
- *   ivf_flat::search_params search_params;
- *   // search K nearest neighbours for each of the N queries
- *   ivf_flat::search(handle, search_params, index, queries, N, K, out_inds, out_dists);
- * @endcode
- *
- * @tparam value_t data element type
- * @tparam idx_t type of the indices in the source dataset
- * @tparam int_t precision / type of integral arguments
- * @tparam matrix_idx_t matrix indexing type
- *
- * @param[in] handle
- * @param[in] params configure the index building
- * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
- *
- * @return the constructed ivf-flat index
- */
-template <typename value_t, typename idx_t>
-auto build_index(const handle_t& handle,
-                 raft::device_matrix_view<const value_t, idx_t, row_major> dataset,
-                 const index_params& params) -> index<value_t, idx_t>
-{
-  return raft::spatial::knn::ivf_flat::detail::build(handle,
-                                                     params,
-                                                     dataset.data_handle(),
-                                                     static_cast<idx_t>(dataset.extent(0)),
-                                                     static_cast<idx_t>(dataset.extent(1)));
-}
-
-/**
- * @brief Build a new index containing the data of the original plus new extra vectors.
- *
- * Implementation note:
- *    The new data is clustered according to existing kmeans clusters, then the cluster
- *    centers are adjusted to match the newly labeled data.
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::spatial::knn;
- *   ivf_flat::index_params index_params;
- *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
- *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_flat::build(handle, index_params, dataset, N, D);
- *   // fill the index with the data
- *   auto index = ivf_flat::extend(handle, index_empty, dataset, nullptr, N);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] handle
- * @param[in] orig_index original index
- * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices a device pointer to a vector of indices [n_rows].
- *    If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param[in] n_rows number of rows in `new_vectors`
- *
- * @return the constructed extended ivf-flat index
- */
-template <typename T, typename IdxT>
-inline auto extend(const handle_t& handle,
-                   const index<T, IdxT>& orig_index,
-                   const T* new_vectors,
-                   const IdxT* new_indices,
-                   IdxT n_rows) -> index<T, IdxT>
-{
-  return raft::spatial::knn::ivf_flat::detail::extend(
-    handle, orig_index, new_vectors, new_indices, n_rows);
-}
-
-/**
- * @brief Build a new index containing the data of the original plus new extra vectors.
- *
- * Implementation note:
- *    The new data is clustered according to existing kmeans clusters, then the cluster
- *    centers are adjusted to match the newly labeled data.
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::spatial::knn;
- *   ivf_flat::index_params index_params;
- *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
- *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_flat::build(handle, index_params, dataset, N, D);
- *   // fill the index with the data
- *   auto index = ivf_flat::extend(handle, index_empty, dataset, nullptr, N);
- * @endcode
- *
- * @tparam value_t data element type
- * @tparam idx_t type of the indices in the source dataset
- * @tparam int_t precision / type of integral arguments
- * @tparam matrix_idx_t matrix indexing type
- *
- * @param[in] handle
- * @param[in] orig_index original index
- * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices a device pointer to a vector of indices [n_rows].
- *    If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
- *    here to imply a continuous range `[0...n_rows)`.
- *
- * @return the constructed extended ivf-flat index
- */
-template <typename value_t, typename idx_t>
-auto extend(const handle_t& handle,
-            const index<value_t, idx_t>& orig_index,
-            raft::device_matrix_view<const value_t, idx_t, row_major> new_vectors,
-            std::optional<raft::device_vector_view<const idx_t, idx_t>> new_indices = std::nullopt)
-  -> index<value_t, idx_t>
-{
-  return raft::spatial::knn::ivf_flat::detail::extend<value_t, idx_t>(
-    handle,
-    orig_index,
-    new_vectors.data_handle(),
-    new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
-    new_vectors.extent(0));
-}
-
 /**
- * @brief Extend the index with the new data.
- * *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param handle
- * @param[inout] index
- * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices a device pointer to a vector of indices [n_rows].
- *    If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param[in] n_rows the number of samples
+ * This file is deprecated and will be removed in a future release.
+ * Please use raft/neighbors/ivf_flat.cuh instead.
  */
-template <typename T, typename IdxT>
-inline void extend(const handle_t& handle,
-                   index<T, IdxT>* index,
-                   const T* new_vectors,
-                   const IdxT* new_indices,
-                   IdxT n_rows)
-{
-  *index = extend(handle, *index, new_vectors, new_indices, n_rows);
-}
 
 /**
- * @brief Extend the index with the new data.
- * *
- * @tparam value_t data element type
- * @tparam idx_t type of the indices in the source dataset
- * @tparam int_t precision / type of integral arguments
- * @tparam matrix_idx_t matrix indexing type
- *
- * @param[in] handle
- * @param[inout] index
- * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices a device pointer to a vector of indices [n_rows].
- *    If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt`
- *    here to imply a continuous range `[0...n_rows)`.
+ * DISCLAIMER: this file is deprecated: use raft/neighbors/ivf_flat.cuh instead
  */
-template <typename value_t, typename idx_t>
-void extend(const handle_t& handle,
-            index<value_t, idx_t>* index,
-            raft::device_matrix_view<const value_t, idx_t, row_major> new_vectors,
-            std::optional<raft::device_vector_view<const idx_t, idx_t>> new_indices = std::nullopt)
-{
-  *index = extend(handle,
-                  *index,
-                  new_vectors.data_handle(),
-                  new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
-                  static_cast<idx_t>(new_vectors.extent(0)));
-}
 
-/**
- * @brief Search ANN using the constructed index.
- *
- * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example.
- *
- * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
- * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
- * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
- * eliminate entirely allocations happening within `search`:
- * @code{.cpp}
- *   ...
- *   // Create a pooling memory resource with a pre-defined initial size.
- *   rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr(
- *     rmm::mr::get_current_device_resource(), 1024 * 1024);
- *   // use default search parameters
- *   ivf_flat::search_params search_params;
- *   // Use the same allocator across multiple searches to reduce the number of
- *   // cuda memory allocations
- *   ivf_flat::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr);
- *   ivf_flat::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr);
- *   ivf_flat::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr);
- *   ...
- * @endcode
- * The exact size of the temporary buffer depends on multiple factors and is an implementation
- * detail. However, you can safely specify a small initial size for the memory pool, so that only a
- * few allocations happen to grow it during the first invocations of the `search`.
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices
- *
- * @param[in] handle
- * @param[in] params configure the search
- * @param[in] index ivf-flat constructed index
- * @param[in] queries a device pointer to a row-major matrix [n_queries, index->dim()]
- * @param[in] n_queries the batch size
- * @param[in] k the number of neighbors to find for each query.
- * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
- * [n_queries, k]
- * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
- * @param[in] mr an optional memory resource to use across the searches (you can provide a large
- * enough memory pool here to avoid memory allocations within search).
- */
-template <typename T, typename IdxT>
-inline void search(const handle_t& handle,
-                   const search_params& params,
-                   const index<T, IdxT>& index,
-                   const T* queries,
-                   uint32_t n_queries,
-                   uint32_t k,
-                   IdxT* neighbors,
-                   float* distances,
-                   rmm::mr::device_memory_resource* mr = nullptr)
-{
-  return raft::spatial::knn::ivf_flat::detail::search(
-    handle, params, index, queries, n_queries, k, neighbors, distances, mr);
-}
+#pragma once
 
-/**
- * @brief Search ANN using the constructed index.
- *
- * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example.
- *
- * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
- * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
- * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
- * eliminate entirely allocations happening within `search`:
- * @code{.cpp}
- *   ...
- *   // Create a pooling memory resource with a pre-defined initial size.
- *   rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr(
- *     rmm::mr::get_current_device_resource(), 1024 * 1024);
- *   // use default search parameters
- *   ivf_flat::search_params search_params;
- *   // Use the same allocator across multiple searches to reduce the number of
- *   // cuda memory allocations
- *   ivf_flat::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr);
- *   ivf_flat::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr);
- *   ivf_flat::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr);
- *   ...
- * @endcode
- * The exact size of the temporary buffer depends on multiple factors and is an implementation
- * detail. However, you can safely specify a small initial size for the memory pool, so that only a
- * few allocations happen to grow it during the first invocations of the `search`.
- *
- * @tparam value_t data element type
- * @tparam idx_t type of the indices
- * @tparam int_t precision / type of integral arguments
- * @tparam matrix_idx_t matrix indexing type
- *
- * @param[in] handle
- * @param[in] index ivf-flat constructed index
- * @param[in] queries a device pointer to a row-major matrix [n_queries, index->dim()]
- * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
- * [n_queries, k]
- * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
- * @param[in] params configure the search
- * @param[in] k the number of neighbors to find for each query.
- */
-template <typename value_t, typename idx_t, typename int_t>
-void search(const handle_t& handle,
-            const index<value_t, idx_t>& index,
-            raft::device_matrix_view<const value_t, idx_t, row_major> queries,
-            raft::device_matrix_view<idx_t, idx_t, row_major> neighbors,
-            raft::device_matrix_view<idx_t, idx_t, float> distances,
-            const search_params& params,
-            int_t k)
-{
-  RAFT_EXPECTS(
-    queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
-    "Number of rows in output neighbors and distances matrices must equal the number of queries.");
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the raft::neighbors version instead.")
 
-  RAFT_EXPECTS(
-    neighbors.extent(1) == distances.extent(1) && neighbors.extent(1) == static_cast<idx_t>(k),
-    "Number of columns in output neighbors and distances matrices must equal k");
+#include <raft/neighbors/ivf_flat.cuh>
 
-  RAFT_EXPECTS(queries.extent(1) == index.dim(),
-               "Number of query dimensions should equal number of dimensions in the index.");
+namespace raft::spatial::knn::ivf_flat {
 
-  return raft::spatial::knn::ivf_flat::detail::search(handle,
-                                                      params,
-                                                      index,
-                                                      queries.data_handle(),
-                                                      queries.extent(0),
-                                                      k,
-                                                      neighbors.data_handle(),
-                                                      distances.data_handle(),
-                                                      nullptr);
-}
+using raft::neighbors::ivf_flat::build;
+using raft::neighbors::ivf_flat::extend;
+using raft::neighbors::ivf_flat::search;
 
-}  // namespace raft::spatial::knn::ivf_flat
+};  // namespace raft::spatial::knn::ivf_flat
diff --git a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
index 41fa1dd8ce..75d777573f 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
+++ b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,267 +13,28 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#pragma once
-
-#include "ann_types.hpp"
-
-#include <raft/core/device_mdarray.hpp>
-#include <raft/core/error.hpp>
-#include <raft/distance/distance_types.hpp>
-#include <raft/util/integer_utils.hpp>
-
-#include <optional>
-
-namespace raft::spatial::knn::ivf_flat {
-
-/** Size of the interleaved group (see `index::data` description). */
-constexpr static uint32_t kIndexGroupSize = 32;
-
-struct index_params : knn::index_params {
-  /** The number of inverted lists (clusters) */
-  uint32_t n_lists = 1024;
-  /** The number of iterations searching for kmeans centers (index building). */
-  uint32_t kmeans_n_iters = 20;
-  /** The fraction of data to use during iterative kmeans building. */
-  double kmeans_trainset_fraction = 0.5;
-};
-
-struct search_params : knn::search_params {
-  /** The number of clusters to search. */
-  uint32_t n_probes = 20;
-};
-
-static_assert(std::is_aggregate_v<index_params>);
-static_assert(std::is_aggregate_v<search_params>);
-
 /**
- * @brief IVF-flat index.
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
+ * This file is deprecated and will be removed in a future release.
+ * Please use raft/neighbors/ivf_flat_types.hpp instead.
  */
-template <typename T, typename IdxT>
-struct index : knn::index {
-  static_assert(!raft::is_narrowing_v<uint32_t, IdxT>,
-                "IdxT must be able to represent all values of uint32_t");
-
- public:
-  /**
-   * Vectorized load/store size in elements, determines the size of interleaved data chunks.
-   *
-   * TODO: in theory, we can lift this to the template parameter and keep it at hardware maximum
-   * possible value by padding the `dim` of the data https://github.com/rapidsai/raft/issues/711
-   */
-  [[nodiscard]] constexpr inline auto veclen() const noexcept -> uint32_t { return veclen_; }
-  /** Distance metric used for clustering. */
-  [[nodiscard]] constexpr inline auto metric() const noexcept -> raft::distance::DistanceType
-  {
-    return metric_;
-  }
-  /**
-   * Inverted list data [size, dim].
-   *
-   * The data consists of the dataset rows, grouped by their labels (into clusters/lists).
-   * Within each list (cluster), the data is grouped into blocks of `kIndexGroupSize` interleaved
-   * vectors. Note, the total index length is slightly larger than the source dataset length,
-   * because each cluster is padded by `kIndexGroupSize` elements.
-   *
-   * Interleaving pattern:
-   * within groups of `kIndexGroupSize` rows, the data is interleaved with the block size equal to
-   * `veclen * sizeof(T)`. That is, a chunk of `veclen` consecutive components of one row is
-   * followed by a chunk of the same size of the next row, and so on.
-   *
-   * __Example__: veclen = 2, dim = 6, kIndexGroupSize = 32, list_size = 31
-   *
-   *     x[ 0, 0], x[ 0, 1], x[ 1, 0], x[ 1, 1], ... x[14, 0], x[14, 1], x[15, 0], x[15, 1],
-   *     x[16, 0], x[16, 1], x[17, 0], x[17, 1], ... x[30, 0], x[30, 1],    -    ,    -    ,
-   *     x[ 0, 2], x[ 0, 3], x[ 1, 2], x[ 1, 3], ... x[14, 2], x[14, 3], x[15, 2], x[15, 3],
-   *     x[16, 2], x[16, 3], x[17, 2], x[17, 3], ... x[30, 2], x[30, 3],    -    ,    -    ,
-   *     x[ 0, 4], x[ 0, 5], x[ 1, 4], x[ 1, 5], ... x[14, 4], x[14, 5], x[15, 4], x[15, 5],
-   *     x[16, 4], x[16, 5], x[17, 4], x[17, 5], ... x[30, 4], x[30, 5],    -    ,    -    ,
-   *
-   */
-  inline auto data() noexcept -> device_mdspan<T, extent_2d<IdxT>, row_major>
-  {
-    return data_.view();
-  }
-  [[nodiscard]] inline auto data() const noexcept
-    -> device_mdspan<const T, extent_2d<size_t>, row_major>
-  {
-    return data_.view();
-  }
-
-  /** Inverted list indices: ids of items in the source data [size] */
-  inline auto indices() noexcept -> device_mdspan<IdxT, extent_1d<IdxT>, row_major>
-  {
-    return indices_.view();
-  }
-  [[nodiscard]] inline auto indices() const noexcept
-    -> device_mdspan<const IdxT, extent_1d<IdxT>, row_major>
-  {
-    return indices_.view();
-  }
-
-  /** Sizes of the lists (clusters) [n_lists] */
-  inline auto list_sizes() noexcept -> device_mdspan<uint32_t, extent_1d<uint32_t>, row_major>
-  {
-    return list_sizes_.view();
-  }
-  [[nodiscard]] inline auto list_sizes() const noexcept
-    -> device_mdspan<const uint32_t, extent_1d<uint32_t>, row_major>
-  {
-    return list_sizes_.view();
-  }
-
-  /**
-   * Offsets into the lists [n_lists + 1].
-   * The last value contains the total length of the index.
-   */
-  inline auto list_offsets() noexcept -> device_mdspan<IdxT, extent_1d<uint32_t>, row_major>
-  {
-    return list_offsets_.view();
-  }
-  [[nodiscard]] inline auto list_offsets() const noexcept
-    -> device_mdspan<const IdxT, extent_1d<uint32_t>, row_major>
-  {
-    return list_offsets_.view();
-  }
-
-  /** k-means cluster centers corresponding to the lists [n_lists, dim] */
-  inline auto centers() noexcept -> device_mdspan<float, extent_2d<uint32_t>, row_major>
-  {
-    return centers_.view();
-  }
-  [[nodiscard]] inline auto centers() const noexcept
-    -> device_mdspan<const float, extent_2d<uint32_t>, row_major>
-  {
-    return centers_.view();
-  }
-
-  /**
-   * (Optional) Precomputed norms of the `centers` w.r.t. the chosen distance metric [n_lists].
-   *
-   * NB: this may be empty if the index is empty or if the metric does not require the center norms
-   * calculation.
-   */
-  inline auto center_norms() noexcept
-    -> std::optional<device_mdspan<float, extent_1d<uint32_t>, row_major>>
-  {
-    if (center_norms_.has_value()) {
-      return std::make_optional<device_mdspan<float, extent_1d<uint32_t>, row_major>>(
-        center_norms_->view());
-    } else {
-      return std::nullopt;
-    }
-  }
-  [[nodiscard]] inline auto center_norms() const noexcept
-    -> std::optional<device_mdspan<const float, extent_1d<uint32_t>, row_major>>
-  {
-    if (center_norms_.has_value()) {
-      return std::make_optional<device_mdspan<const float, extent_1d<uint32_t>, row_major>>(
-        center_norms_->view());
-    } else {
-      return std::nullopt;
-    }
-  }
 
-  /** Total length of the index. */
-  [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT { return indices_.extent(0); }
-  /** Dimensionality of the data. */
-  [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t
-  {
-    return centers_.extent(1);
-  }
-  /** Number of clusters/inverted lists. */
-  [[nodiscard]] constexpr inline auto n_lists() const noexcept -> uint32_t
-  {
-    return centers_.extent(0);
-  }
-
-  // Don't allow copying the index for performance reasons (try avoiding copying data)
-  index(const index&) = delete;
-  index(index&&)      = default;
-  auto operator=(const index&) -> index& = delete;
-  auto operator=(index&&) -> index& = default;
-  ~index()                          = default;
-
-  /** Construct an empty index. It needs to be trained and then populated. */
-  index(const handle_t& handle, raft::distance::DistanceType metric, uint32_t n_lists, uint32_t dim)
-    : knn::index(),
-      veclen_(calculate_veclen(dim)),
-      metric_(metric),
-      data_(make_device_mdarray<T>(handle, make_extents<IdxT>(0, dim))),
-      indices_(make_device_mdarray<IdxT>(handle, make_extents<IdxT>(0))),
-      list_sizes_(make_device_mdarray<uint32_t>(handle, make_extents<uint32_t>(n_lists))),
-      list_offsets_(make_device_mdarray<IdxT>(handle, make_extents<uint32_t>(n_lists + 1))),
-      centers_(make_device_mdarray<float>(handle, make_extents<uint32_t>(n_lists, dim))),
-      center_norms_(std::nullopt)
-  {
-    check_consistency();
-  }
+/**
+ * DISCLAIMER: this file is deprecated: use raft/neighbors/ivf_flat_types.hpp instead
+ */
 
-  /** Construct an empty index. It needs to be trained and then populated. */
-  index(const handle_t& handle, const index_params& params, uint32_t dim)
-    : index(handle, params.metric, params.n_lists, dim)
-  {
-  }
+#pragma once
 
-  /**
-   * Replace the content of the index with new uninitialized mdarrays to hold the indicated amount
-   * of data.
-   */
-  void allocate(const handle_t& handle, IdxT index_size, bool allocate_center_norms)
-  {
-    data_    = make_device_mdarray<T>(handle, make_extents<IdxT>(index_size, dim()));
-    indices_ = make_device_mdarray<IdxT>(handle, make_extents<IdxT>(index_size));
-    center_norms_ =
-      allocate_center_norms
-        ? std::optional(make_device_mdarray<float>(handle, make_extents<uint32_t>(n_lists())))
-        : std::nullopt;
-    check_consistency();
-  }
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the raft::neighbors version instead.")
 
- private:
-  /**
-   * TODO: in theory, we can lift this to the template parameter and keep it at hardware maximum
-   * possible value by padding the `dim` of the data https://github.com/rapidsai/raft/issues/711
-   */
-  uint32_t veclen_;
-  raft::distance::DistanceType metric_;
-  device_mdarray<T, extent_2d<IdxT>, row_major> data_;
-  device_mdarray<IdxT, extent_1d<IdxT>, row_major> indices_;
-  device_mdarray<uint32_t, extent_1d<uint32_t>, row_major> list_sizes_;
-  device_mdarray<IdxT, extent_1d<uint32_t>, row_major> list_offsets_;
-  device_mdarray<float, extent_2d<uint32_t>, row_major> centers_;
-  std::optional<device_mdarray<float, extent_1d<uint32_t>, row_major>> center_norms_;
+#include <raft/neighbors/ivf_flat_types.hpp>
 
-  /** Throw an error if the index content is inconsistent. */
-  void check_consistency()
-  {
-    RAFT_EXPECTS(dim() % veclen_ == 0, "dimensionality is not a multiple of the veclen");
-    RAFT_EXPECTS(data_.extent(0) == indices_.extent(0), "inconsistent index size");
-    RAFT_EXPECTS(data_.extent(1) == IdxT(centers_.extent(1)), "inconsistent data dimensionality");
-    RAFT_EXPECTS(                                               //
-      (centers_.extent(0) == list_sizes_.extent(0)) &&          //
-        (centers_.extent(0) + 1 == list_offsets_.extent(0)) &&  //
-        (!center_norms_.has_value() || centers_.extent(0) == center_norms_->extent(0)),
-      "inconsistent number of lists (clusters)");
-    RAFT_EXPECTS(reinterpret_cast<size_t>(data_.data_handle()) % (veclen_ * sizeof(T)) == 0,
-                 "The data storage pointer is not aligned to the vector length");
-  }
+namespace raft::spatial::knn::ivf_flat {
 
-  static auto calculate_veclen(uint32_t dim) -> uint32_t
-  {
-    // TODO: consider padding the dimensions and fixing veclen to its maximum possible value as a
-    // template parameter (https://github.com/rapidsai/raft/issues/711)
-    uint32_t veclen = 16 / sizeof(T);
-    while (dim % veclen != 0) {
-      veclen = veclen >> 1;
-    }
-    return veclen;
-  }
-};
+using raft::neighbors::ivf_flat::index;
+using raft::neighbors::ivf_flat::index_params;
+using raft::neighbors::ivf_flat::kIndexGroupSize;
+using raft::neighbors::ivf_flat::search_params;
 
-}  // namespace raft::spatial::knn::ivf_flat
+};  // namespace raft::spatial::knn::ivf_flat
diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh
index 35cd408092..0f175f41bb 100644
--- a/cpp/include/raft/spatial/knn/ivf_pq.cuh
+++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,182 +13,27 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#pragma once
-
-#include "detail/ivf_pq_build.cuh"
-#include "detail/ivf_pq_search.cuh"
-#include "ivf_pq_types.hpp"
-
-#include <raft/core/handle.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-
-namespace raft::spatial::knn::ivf_pq {
-
 /**
- * @brief Build the index from the dataset for efficient search.
- *
- * NB: Currently, the following distance metrics are supported:
- * - L2Expanded
- * - L2Unexpanded
- * - InnerProduct
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::spatial::knn;
- *   // use default index parameters
- *   ivf_pq::index_params index_params;
- *   // create and fill the index from a [N, D] dataset
- *   auto index = ivf_pq::build(handle, index_params, dataset, N, D);
- *   // use default search parameters
- *   ivf_pq::search_params search_params;
- *   // search K nearest neighbours for each of the N queries
- *   ivf_pq::search(handle, search_params, index, queries, N, K, out_inds, out_dists);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param handle
- * @param params configure the index building
- * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
- * @param n_rows the number of samples
- * @param dim the dimensionality of the data
- *
- * @return the constructed ivf-pq index
+ * This file is deprecated and will be removed in a future release.
+ * Please use the raft::neighbors version instead.
  */
-template <typename T, typename IdxT = uint32_t>
-inline auto build(
-  const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim)
-  -> index<IdxT>
-{
-  return raft::spatial::knn::ivf_pq::detail::build(handle, params, dataset, n_rows, dim);
-}
 
 /**
- * @brief Build a new index containing the data of the original plus new extra vectors.
- *
- * Implementation note:
- *    The new data is clustered according to existing kmeans clusters, then the cluster
- *    centers are unchanged.
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::spatial::knn;
- *   ivf_pq::index_params index_params;
- *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
- *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_pq::build(handle, index_params, dataset, N, D);
- *   // fill the index with the data
- *   auto index = ivf_pq::extend(handle, index_empty, dataset, nullptr, N);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param handle
- * @param orig_index original index
- * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices a device pointer to a vector of indices [n_rows].
- *    If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param n_rows the number of samples
- *
- * @return the constructed extended ivf-pq index
+ * DISCLAIMER: this file is deprecated: use raft/neighbors/ivf_pq.cuh instead
  */
-template <typename T, typename IdxT>
-inline auto extend(const handle_t& handle,
-                   const index<IdxT>& orig_index,
-                   const T* new_vectors,
-                   const IdxT* new_indices,
-                   IdxT n_rows) -> index<IdxT>
-{
-  return raft::spatial::knn::ivf_pq::detail::extend(
-    handle, orig_index, new_vectors, new_indices, n_rows);
-}
 
-/**
- * @brief Extend the index with the new data.
- * *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param handle
- * @param[inout] index
- * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices a device pointer to a vector of indices [n_rows].
- *    If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param n_rows the number of samples
- */
-template <typename T, typename IdxT>
-inline void extend(const handle_t& handle,
-                   index<IdxT>* index,
-                   const T* new_vectors,
-                   const IdxT* new_indices,
-                   IdxT n_rows)
-{
-  *index = extend(handle, *index, new_vectors, new_indices, n_rows);
-}
+#pragma once
 
-/**
- * @brief Search ANN using the constructed index.
- *
- * See the [ivf_pq::build](#ivf_pq::build) documentation for a usage example.
- *
- * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
- * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
- * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
- * eliminate entirely allocations happening within `search`:
- * @code{.cpp}
- *   ...
- *   // Create a pooling memory resource with a pre-defined initial size.
- *   rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr(
- *     rmm::mr::get_current_device_resource(), 1024 * 1024);
- *   // use default search parameters
- *   ivf_pq::search_params search_params;
- *   // Use the same allocator across multiple searches to reduce the number of
- *   // cuda memory allocations
- *   ivf_pq::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr);
- *   ivf_pq::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr);
- *   ivf_pq::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr);
- *   ...
- * @endcode
- * The exact size of the temporary buffer depends on multiple factors and is an implementation
- * detail. However, you can safely specify a small initial size for the memory pool, so that only a
- * few allocations happen to grow it during the first invocations of the `search`.
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices
- *
- * @param handle
- * @param params configure the search
- * @param index ivf-pq constructed index
- * @param[in] queries a device pointer to a row-major matrix [n_queries, index->dim()]
- * @param n_queries the batch size
- * @param k the number of neighbors to find for each query.
- * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
- * [n_queries, k]
- * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
- * @param mr an optional memory resource to use across the searches (you can provide a large enough
- *           memory pool here to avoid memory allocations within search).
- */
-template <typename T, typename IdxT>
-inline void search(const handle_t& handle,
-                   const search_params& params,
-                   const index<IdxT>& index,
-                   const T* queries,
-                   uint32_t n_queries,
-                   uint32_t k,
-                   IdxT* neighbors,
-                   float* distances,
-                   rmm::mr::device_memory_resource* mr = nullptr)
-{
-  return raft::spatial::knn::ivf_pq::detail::search(
-    handle, params, index, queries, n_queries, k, neighbors, distances, mr);
-}
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the raft::neighbors version instead.")
+
+#include <raft/neighbors/ivf_pq.cuh>
+
+namespace raft::spatial::knn::ivf_pq {
+
+using raft::neighbors::ivf_pq::build;
+using raft::neighbors::ivf_pq::extend;
+using raft::neighbors::ivf_pq::search;
 
 }  // namespace raft::spatial::knn::ivf_pq
diff --git a/cpp/include/raft/spatial/knn/ivf_pq_types.hpp b/cpp/include/raft/spatial/knn/ivf_pq_types.hpp
index b0b8b8d45f..83fb78eb46 100644
--- a/cpp/include/raft/spatial/knn/ivf_pq_types.hpp
+++ b/cpp/include/raft/spatial/knn/ivf_pq_types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,422 +13,28 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-#pragma once
-
-#include "common.hpp"
-
-#include <raft/core/device_mdarray.hpp>
-#include <raft/core/error.hpp>
-#include <raft/distance/distance_types.hpp>
-#include <raft/util/integer_utils.hpp>
-
-#include <type_traits>
-
-namespace raft::spatial::knn::ivf_pq {
-
-/** A type for specifying how PQ codebooks are created. */
-enum class codebook_gen {  // NOLINT
-  PER_SUBSPACE = 0,        // NOLINT
-  PER_CLUSTER  = 1,        // NOLINT
-};
-
-struct index_params : knn::index_params {
-  /**
-   * The number of inverted lists (clusters)
-   *
-   * Hint: the number of vectors per cluster (`n_rows/n_lists`) should be approximately 1,000 to
-   * 10,000.
-   */
-  uint32_t n_lists = 1024;
-  /** The number of iterations searching for kmeans centers (index building). */
-  uint32_t kmeans_n_iters = 20;
-  /** The fraction of data to use during iterative kmeans building. */
-  double kmeans_trainset_fraction = 0.5;
-  /**
-   * The bit length of the vector element after compression by PQ.
-   *
-   * Possible values: [4, 5, 6, 7, 8].
-   *
-   * Hint: the smaller the 'pq_bits', the smaller the index size and the better the search
-   * performance, but the lower the recall.
-   */
-  uint32_t pq_bits = 8;
-  /**
-   * The dimensionality of the vector after compression by PQ. When zero, an optimal value is
-   * selected using a heuristic.
-   *
-   * NB: `pq_dim * pq_bits` must be a multiple of 8.
-   *
-   * Hint: a smaller 'pq_dim' results in a smaller index size and better search performance, but
-   * lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number, but multiple of 8 are
-   * desirable for good performance. If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8.
-   * For good performance, it is desirable that 'pq_dim' is a multiple of 32. Ideally, 'pq_dim'
-   * should be also a divisor of the dataset dim.
-   */
-  uint32_t pq_dim = 0;
-  /** How PQ codebooks are created. */
-  codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE;
-  /**
-   * Apply a random rotation matrix on the input data and queries even if `dim % pq_dim == 0`.
-   *
-   * Note: if `dim` is not multiple of `pq_dim`, a random rotation is always applied to the input
-   * data and queries to transform the working space from `dim` to `rot_dim`, which may be slightly
-   * larger than the original space and and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
-   * However, this transform is not necessary when `dim` is multiple of `pq_dim`
-   *   (`dim == rot_dim`, hence no need in adding "extra" data columns / features).
-   *
-   * By default, if `dim == rot_dim`, the rotation transform is initialized with the identity
-   * matrix. When `force_random_rotation == true`, a random orthogonal transform matrix is generated
-   * regardless of the values of `dim` and `pq_dim`.
-   */
-  bool force_random_rotation = false;
-};
-
-struct search_params : knn::search_params {
-  /** The number of clusters to search. */
-  uint32_t n_probes = 20;
-  /**
-   * Data type of look up table to be created dynamically at search time.
-   *
-   * Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U]
-   *
-   * The use of low-precision types reduces the amount of shared memory required at search time, so
-   * fast shared memory kernels can be used even for datasets with large dimansionality. Note that
-   * the recall is slightly degraded when low-precision type is selected.
-   */
-  cudaDataType_t lut_dtype = CUDA_R_32F;
-  /**
-   * Storage data type for distance/similarity computed at search time.
-   *
-   * Possible values: [CUDA_R_16F, CUDA_R_32F]
-   *
-   * If the performance limiter at search time is device memory access, selecting FP16 will improve
-   * performance slightly.
-   */
-  cudaDataType_t internal_distance_dtype = CUDA_R_32F;
-  /**
-   * Thread block size of the distance calculation kernel at search time.
-   * When zero, an optimal block size is selected using a heuristic.
-   *
-   * Possible values: [0, 256, 512, 1024]
-   */
-  uint32_t preferred_thread_block_size = 0;
-};
-
-static_assert(std::is_aggregate_v<index_params>);
-static_assert(std::is_aggregate_v<search_params>);
-
 /**
- * @brief IVF-PQ index.
- *
- * In the IVF-PQ index, a database vector y is approximated with two level quantization:
- *
- * y = Q_1(y) + Q_2(y - Q_1(y))
- *
- * The first level quantizer (Q_1), maps the vector y to the nearest cluster center. The number of
- * clusters is n_lists.
- *
- * The second quantizer encodes the residual, and it is defined as a product quantizer [1].
- *
- * A product quantizer encodes a `dim` dimensional vector with a `pq_dim` dimensional vector.
- * First we split the input vector into `pq_dim` subvectors (denoted by u), where each u vector
- * contains `pq_len` distinct components of y
- *
- * y_1, y_2, ... y_{pq_len}, y_{pq_len+1}, ... y_{2*pq_len}, ... y_{dim-pq_len+1} ... y_{dim}
- *  \___________________/     \____________________________/      \______________________/
- *         u_1                         u_2                          u_{pq_dim}
- *
- * Then each subvector encoded with a separate quantizer q_i, end the results are concatenated
- *
- * Q_2(y) = q_1(u_1),q_2(u_2),...,q_{pq_dim}(u_pq_dim})
- *
- * Each quantizer q_i outputs a code with pq_bit bits. The second level quantizers are also defined
- * by k-means clustering in the corresponding sub-space: the reproduction values are the centroids,
- * and the set of reproduction values is the codebook.
- *
- * When the data dimensionality `dim` is not multiple of `pq_dim`, the feature space is transformed
- * using a random orthogonal matrix to have `rot_dim = pq_dim * pq_len` dimensions
- * (`rot_dim >= dim`).
- *
- * The second-level quantizers are trained either for each subspace or for each cluster:
- *   (a) codebook_gen::PER_SUBSPACE:
- *         creates `pq_dim` second-level quantizers - one for each slice of the data along features;
- *   (b) codebook_gen::PER_CLUSTER:
- *         creates `n_lists` second-level quantizers - one for each first-level cluster.
- * In either case, the centroids are again found using k-means clustering interpreting the data as
- * having pq_len dimensions.
- *
- * [1] Product quantization for nearest neighbor search Herve Jegou, Matthijs Douze, Cordelia Schmid
- *
- * @tparam IdxT type of the indices in the source dataset
- *
+ * This file is deprecated and will be removed in a future release.
+ * Please use the raft::neighbors version instead.
  */
-template <typename IdxT>
-struct index : knn::index {
-  static_assert(!raft::is_narrowing_v<uint32_t, IdxT>,
-                "IdxT must be able to represent all values of uint32_t");
-
- public:
-  /** Total length of the index. */
-  [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT { return indices_.extent(0); }
-  /** Dimensionality of the input data. */
-  [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t { return dim_; }
-  /**
-   * Dimensionality of the cluster centers:
-   * input data dim extended with vector norms and padded to 8 elems.
-   */
-  [[nodiscard]] constexpr inline auto dim_ext() const noexcept -> uint32_t
-  {
-    return raft::round_up_safe(dim() + 1, 8u);
-  }
-  /**
-   * Dimensionality of the data after transforming it for PQ processing
-   * (rotated and augmented to be muplitple of `pq_dim`).
-   */
-  [[nodiscard]] constexpr inline auto rot_dim() const noexcept -> uint32_t
-  {
-    return pq_len() * pq_dim();
-  }
-  /** The bit length of an encoded vector element after compression by PQ. */
-  [[nodiscard]] constexpr inline auto pq_bits() const noexcept -> uint32_t { return pq_bits_; }
-  /** The dimensionality of an encoded vector after compression by PQ. */
-  [[nodiscard]] constexpr inline auto pq_dim() const noexcept -> uint32_t { return pq_dim_; }
-  /** Dimensionality of a subspaces, i.e. the number of vector components mapped to a subspace */
-  [[nodiscard]] constexpr inline auto pq_len() const noexcept -> uint32_t
-  {
-    return raft::div_rounding_up_unsafe(dim(), pq_dim());
-  }
-  /** The number of vectors in a PQ codebook (`1 << pq_bits`). */
-  [[nodiscard]] constexpr inline auto pq_book_size() const noexcept -> uint32_t
-  {
-    return 1 << pq_bits();
-  }
-  /** Distance metric used for clustering. */
-  [[nodiscard]] constexpr inline auto metric() const noexcept -> raft::distance::DistanceType
-  {
-    return metric_;
-  }
-  /** How PQ codebooks are created. */
-  [[nodiscard]] constexpr inline auto codebook_kind() const noexcept -> codebook_gen
-  {
-    return codebook_kind_;
-  }
-  /** Number of clusters/inverted lists (first level quantization). */
-  [[nodiscard]] constexpr inline auto n_lists() const noexcept -> uint32_t { return n_lists_; }
-  /** Number of non-empty clusters/inverted lists. */
-  [[nodiscard]] constexpr inline auto n_nonempty_lists() const noexcept -> uint32_t
-  {
-    return n_nonempty_lists_;
-  }
-
-  // Don't allow copying the index for performance reasons (try avoiding copying data)
-  index(const index&) = delete;
-  index(index&&)      = default;
-  auto operator=(const index&) -> index& = delete;
-  auto operator=(index&&) -> index& = default;
-  ~index()                          = default;
-
-  /** Construct an empty index. It needs to be trained and then populated. */
-  index(const handle_t& handle,
-        raft::distance::DistanceType metric,
-        codebook_gen codebook_kind,
-        uint32_t n_lists,
-        uint32_t dim,
-        uint32_t pq_bits          = 8,
-        uint32_t pq_dim           = 0,
-        uint32_t n_nonempty_lists = 0)
-    : knn::index(),
-      metric_(metric),
-      codebook_kind_(codebook_kind),
-      n_lists_(n_lists),
-      dim_(dim),
-      pq_bits_(pq_bits),
-      pq_dim_(pq_dim == 0 ? calculate_pq_dim(dim) : pq_dim),
-      n_nonempty_lists_(n_nonempty_lists),
-      pq_centers_{make_device_mdarray<float>(handle, make_pq_centers_extents())},
-      pq_dataset_{make_device_mdarray<uint8_t>(
-        handle, make_extents<IdxT>(0, this->pq_dim() * this->pq_bits() / 8))},
-      indices_{make_device_mdarray<IdxT>(handle, make_extents<IdxT>(0))},
-      rotation_matrix_{
-        make_device_mdarray<float>(handle, make_extents<uint32_t>(this->rot_dim(), this->dim()))},
-      list_offsets_{make_device_mdarray<IdxT>(handle, make_extents<uint32_t>(this->n_lists() + 1))},
-      centers_{make_device_mdarray<float>(
-        handle, make_extents<uint32_t>(this->n_lists(), this->dim_ext()))},
-      centers_rot_{make_device_mdarray<float>(
-        handle, make_extents<uint32_t>(this->n_lists(), this->rot_dim()))}
-  {
-    check_consistency();
-  }
-
-  /** Construct an empty index. It needs to be trained and then populated. */
-  index(const handle_t& handle,
-        const index_params& params,
-        uint32_t dim,
-        uint32_t n_nonempty_lists = 0)
-    : index(handle,
-            params.metric,
-            params.codebook_kind,
-            params.n_lists,
-            dim,
-            params.pq_bits,
-            params.pq_dim,
-            n_nonempty_lists)
-  {
-  }
-
-  /**
-   * Replace the content of the index with new uninitialized mdarrays to hold the indicated amount
-   * of data.
-   */
-  void allocate(const handle_t& handle, IdxT index_size)
-  {
-    pq_dataset_ =
-      make_device_mdarray<uint8_t>(handle, make_extents<IdxT>(index_size, pq_dataset_.extent(1)));
-    indices_ = make_device_mdarray<IdxT>(handle, make_extents<IdxT>(index_size));
-    check_consistency();
-  }
 
-  /**
-   * PQ cluster centers
-   *
-   *   - codebook_gen::PER_SUBSPACE: [pq_dim , pq_book_size, pq_len]
-   *   - codebook_gen::PER_CLUSTER:  [n_lists, pq_book_size, pq_len]
-   */
-  inline auto pq_centers() noexcept -> device_mdspan<float, extent_3d<uint32_t>, row_major>
-  {
-    return pq_centers_.view();
-  }
-  [[nodiscard]] inline auto pq_centers() const noexcept
-    -> device_mdspan<const float, extent_3d<uint32_t>, row_major>
-  {
-    return pq_centers_.view();
-  }
-
-  /** PQ-encoded data [size, pq_dim * pq_bits / 8]. */
-  inline auto pq_dataset() noexcept -> device_mdspan<uint8_t, extent_2d<IdxT>, row_major>
-  {
-    return pq_dataset_.view();
-  }
-  [[nodiscard]] inline auto pq_dataset() const noexcept
-    -> device_mdspan<const uint8_t, extent_2d<IdxT>, row_major>
-  {
-    return pq_dataset_.view();
-  }
-
-  /** Inverted list indices: ids of items in the source data [size] */
-  inline auto indices() noexcept -> device_mdspan<IdxT, extent_1d<IdxT>, row_major>
-  {
-    return indices_.view();
-  }
-  [[nodiscard]] inline auto indices() const noexcept
-    -> device_mdspan<const IdxT, extent_1d<IdxT>, row_major>
-  {
-    return indices_.view();
-  }
-
-  /** The transform matrix (original space -> rotated padded space) [rot_dim, dim] */
-  inline auto rotation_matrix() noexcept -> device_mdspan<float, extent_2d<uint32_t>, row_major>
-  {
-    return rotation_matrix_.view();
-  }
-  [[nodiscard]] inline auto rotation_matrix() const noexcept
-    -> device_mdspan<const float, extent_2d<uint32_t>, row_major>
-  {
-    return rotation_matrix_.view();
-  }
-
-  /**
-   * Offsets into the lists [n_lists + 1].
-   * The last value contains the total length of the index.
-   */
-  inline auto list_offsets() noexcept -> device_mdspan<IdxT, extent_1d<uint32_t>, row_major>
-  {
-    return list_offsets_.view();
-  }
-  [[nodiscard]] inline auto list_offsets() const noexcept
-    -> device_mdspan<const IdxT, extent_1d<uint32_t>, row_major>
-  {
-    return list_offsets_.view();
-  }
-
-  /** Cluster centers corresponding to the lists in the original space [n_lists, dim_ext] */
-  inline auto centers() noexcept -> device_mdspan<float, extent_2d<uint32_t>, row_major>
-  {
-    return centers_.view();
-  }
-  [[nodiscard]] inline auto centers() const noexcept
-    -> device_mdspan<const float, extent_2d<uint32_t>, row_major>
-  {
-    return centers_.view();
-  }
-
-  /** Cluster centers corresponding to the lists in the rotated space [n_lists, rot_dim] */
-  inline auto centers_rot() noexcept -> device_mdspan<float, extent_2d<uint32_t>, row_major>
-  {
-    return centers_rot_.view();
-  }
-  [[nodiscard]] inline auto centers_rot() const noexcept
-    -> device_mdspan<const float, extent_2d<uint32_t>, row_major>
-  {
-    return centers_rot_.view();
-  }
+/**
+ * DISCLAIMER: this file is deprecated: use raft/neighbors/ivf_pq_types.hpp instead
+ */
 
- private:
-  raft::distance::DistanceType metric_;
-  codebook_gen codebook_kind_;
-  uint32_t n_lists_;
-  uint32_t dim_;
-  uint32_t pq_bits_;
-  uint32_t pq_dim_;
-  uint32_t n_nonempty_lists_;
+#pragma once
 
-  device_mdarray<float, extent_3d<uint32_t>, row_major> pq_centers_;
-  device_mdarray<uint8_t, extent_2d<IdxT>, row_major> pq_dataset_;
-  device_mdarray<IdxT, extent_1d<IdxT>, row_major> indices_;
-  device_mdarray<float, extent_2d<uint32_t>, row_major> rotation_matrix_;
-  device_mdarray<IdxT, extent_1d<uint32_t>, row_major> list_offsets_;
-  device_mdarray<float, extent_2d<uint32_t>, row_major> centers_;
-  device_mdarray<float, extent_2d<uint32_t>, row_major> centers_rot_;
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the raft::neighbors version instead.")
 
-  /** Throw an error if the index content is inconsistent. */
-  void check_consistency()
-  {
-    RAFT_EXPECTS(pq_bits() >= 4 && pq_bits() <= 8,
-                 "`pq_bits` must be within closed range [4,8], but got %u.",
-                 pq_bits());
-    RAFT_EXPECTS((pq_bits() * pq_dim()) % 8 == 0,
-                 "`pq_bits * pq_dim` must be a multiple of 8, but got %u * %u = %u.",
-                 pq_bits(),
-                 pq_dim(),
-                 pq_bits() * pq_dim());
-  }
+#include <raft/neighbors/ivf_pq_types.hpp>
 
-  auto make_pq_centers_extents() -> extent_3d<uint32_t>
-  {
-    switch (codebook_kind()) {
-      case codebook_gen::PER_SUBSPACE:
-        return make_extents<uint32_t>(pq_dim(), pq_book_size(), pq_len());
-      case codebook_gen::PER_CLUSTER:
-        return make_extents<uint32_t>(n_lists(), pq_book_size(), pq_len());
-      default: RAFT_FAIL("Unreachable code");
-    }
-  }
+namespace raft::spatial::knn::ivf_pq {
 
-  static inline auto calculate_pq_dim(uint32_t dim) -> uint32_t
-  {
-    // If the dimensionality is large enough, we can reduce it to improve performance
-    if (dim >= 128) { dim /= 2; }
-    // Round it down to 32 to improve performance.
-    uint32_t r = raft::round_down_safe<uint32_t>(dim, 32);
-    if (r > 0) return r;
-    // If the dimensionality is really low, round it to the closest power-of-two
-    r = 1;
-    while ((r << 1) <= dim) {
-      r = r << 1;
-    }
-    return r;
-  }
-};
+using raft::neighbors::ivf_pq::codebook_gen;
+using raft::neighbors::ivf_pq::index;
+using raft::neighbors::ivf_pq::index_params;
+using raft::neighbors::ivf_pq::search_params;
 
 }  // namespace raft::spatial::knn::ivf_pq
diff --git a/cpp/include/raft/spatial/knn/specializations.cuh b/cpp/include/raft/spatial/knn/specializations.cuh
index b1f174e716..0511bbbf6c 100644
--- a/cpp/include/raft/spatial/knn/specializations.cuh
+++ b/cpp/include/raft/spatial/knn/specializations.cuh
@@ -19,10 +19,10 @@
 
 #pragma once
 
-#include <raft/spatial/knn/specializations/ball_cover.cuh>
-#include <raft/spatial/knn/specializations/fused_l2_knn.cuh>
-#include <raft/spatial/knn/specializations/knn.cuh>
+#include <raft/neighbors/specializations/ball_cover.cuh>
+#include <raft/neighbors/specializations/fused_l2_knn.cuh>
+#include <raft/neighbors/specializations/knn.cuh>
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 #endif
diff --git a/cpp/include/raft/stats/adjusted_rand_index.cuh b/cpp/include/raft/stats/adjusted_rand_index.cuh
index e1b6a241c4..93fd07eb0b 100644
--- a/cpp/include/raft/stats/adjusted_rand_index.cuh
+++ b/cpp/include/raft/stats/adjusted_rand_index.cuh
@@ -31,8 +31,8 @@ namespace raft {
 namespace stats {
 
 /**
- * @brief Function to calculate Adjusted RandIndex as described
- *        <a href="https://en.wikipedia.org/wiki/Rand_index">here</a>
+ * @brief Function to calculate Adjusted RandIndex
+ * @see https://en.wikipedia.org/wiki/Rand_index
  * @tparam T data-type for input label arrays
  * @tparam MathT integral data-type used for computing n-choose-r
  * @param firstClusterArray: the array of classes
@@ -50,8 +50,8 @@ double adjusted_rand_index(const T* firstClusterArray,
 }
 
 /**
- * @brief Function to calculate Adjusted RandIndex as described
- *        <a href="https://en.wikipedia.org/wiki/Rand_index">here</a>
+ * @brief Function to calculate Adjusted RandIndex
+ * @see https://en.wikipedia.org/wiki/Rand_index
  * @tparam value_t data-type for input label arrays
  * @tparam math_t integral data-type used for computing n-choose-r
  * @tparam idx_t Index type of matrix extent.
diff --git a/cpp/include/raft/stats/common.hpp b/cpp/include/raft/stats/common.hpp
index 8392bd50fe..724ca224c6 100644
--- a/cpp/include/raft/stats/common.hpp
+++ b/cpp/include/raft/stats/common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,59 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #pragma once
 
-#include <raft/util/cudart_utils.hpp>
-
-// This file is a shameless amalgamation of independent works done by
-// Lars Nyland and Andy Adinets
-
-///@todo: add cub's histogram as another option
-
-namespace raft {
-namespace stats {
-
-/** Default mapper which just returns the value of the data itself */
-template <typename DataT, typename IdxT>
-struct IdentityBinner {
-  DI int operator()(DataT val, IdxT row, IdxT col) { return int(val); }
-};
-
-/** Types of support histogram implementations */
-enum HistType {
-  /** shared mem atomics but with bins to be 1b int's */
-  HistTypeSmemBits1 = 1,
-  /** shared mem atomics but with bins to be 2b int's */
-  HistTypeSmemBits2 = 2,
-  /** shared mem atomics but with bins to be 4b int's */
-  HistTypeSmemBits4 = 4,
-  /** shared mem atomics but with bins to ba 1B int's */
-  HistTypeSmemBits8 = 8,
-  /** shared mem atomics but with bins to be 2B int's */
-  HistTypeSmemBits16 = 16,
-  /** use only global atomics */
-  HistTypeGmem,
-  /** uses shared mem atomics to reduce global traffic */
-  HistTypeSmem,
-  /**
-   * uses shared mem atomics with match_any intrinsic to further reduce shared
-   * memory traffic. This can only be enabled on Volta and later architectures.
-   * If one tries to enable this for older arch's, it will fall back to
-   * `HistTypeSmem`.
-   * @note This is to be used only when the input dataset leads to a lot of
-   *       repetitions in a given warp, else, this algo can be much slower than
-   *       `HistTypeSmem`!
-   */
-  HistTypeSmemMatchAny,
-  /** builds a hashmap of active bins in shared mem */
-  HistTypeSmemHash,
-  /** decide at runtime the best algo for the given inputs */
-  HistTypeAuto
-};
-
-/// Supported types of information criteria
-enum IC_Type { AIC, AICc, BIC };
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the raft/stats/stats_types.hpp version instead.")
 
-};  // end namespace stats
-};  // end namespace raft
+#include <raft/stats/stats_types.hpp>
diff --git a/cpp/include/raft/stats/detail/histogram.cuh b/cpp/include/raft/stats/detail/histogram.cuh
index 777e0b7816..69bd721ded 100644
--- a/cpp/include/raft/stats/detail/histogram.cuh
+++ b/cpp/include/raft/stats/detail/histogram.cuh
@@ -32,6 +32,12 @@ namespace raft {
 namespace stats {
 namespace detail {
 
+/** Default mapper: returns the data value cast to int; the row/col arguments are unused */
+template <typename DataT, typename IdxT>
+struct IdentityBinner {
+  DI int operator()(DataT val, IdxT row, IdxT col) { return int(val); }
+};
+
 static const int ThreadsPerBlock = 256;
 
 template <typename IdxT, int VecLen>
diff --git a/cpp/include/raft/stats/histogram.cuh b/cpp/include/raft/stats/histogram.cuh
index df1c2772f1..8efb2e8df8 100644
--- a/cpp/include/raft/stats/histogram.cuh
+++ b/cpp/include/raft/stats/histogram.cuh
@@ -31,6 +31,14 @@
 namespace raft {
 namespace stats {
 
+/**
+ * Default mapper which returns the value of the data itself; inherits detail::IdentityBinner
+ */
+template <typename DataT, typename IdxT>
+struct IdentityBinner : public detail::IdentityBinner<DataT, IdxT> {
+  IdentityBinner() : detail::IdentityBinner<DataT, IdxT>() {}
+};
+
 /**
  * @brief Perform histogram on the input data. It chooses the right load size
  * based on the input data vector length. It also supports large-bin cases
diff --git a/cpp/include/raft/stats/stats_types.hpp b/cpp/include/raft/stats/stats_types.hpp
new file mode 100644
index 0000000000..5db5ef1c57
--- /dev/null
+++ b/cpp/include/raft/stats/stats_types.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft::stats {
+
+/**
+ * @brief Types of supported histogram implementations
+ */
+enum HistType {
+  /** shared mem atomics, with bins stored as 1-bit ints */
+  HistTypeSmemBits1 = 1,
+  /** shared mem atomics, with bins stored as 2-bit ints */
+  HistTypeSmemBits2 = 2,
+  /** shared mem atomics, with bins stored as 4-bit ints */
+  HistTypeSmemBits4 = 4,
+  /** shared mem atomics, with bins stored as 1-byte (8-bit) ints */
+  HistTypeSmemBits8 = 8,
+  /** shared mem atomics, with bins stored as 2-byte (16-bit) ints */
+  HistTypeSmemBits16 = 16,
+  /** use only global atomics (no explicit value: continues from 16, i.e. 17) */
+  HistTypeGmem,
+  /** uses shared mem atomics to reduce global traffic */
+  HistTypeSmem,
+  /**
+   * uses shared mem atomics with match_any intrinsic to further reduce shared
+   * memory traffic. This can only be enabled on Volta and later architectures.
+   * If one tries to enable this for older arch's, it will fall back to
+   * `HistTypeSmem`.
+   * @note This is to be used only when the input dataset leads to a lot of
+   *       repetitions in a given warp, else, this algo can be much slower than
+   *       `HistTypeSmem`!
+   */
+  HistTypeSmemMatchAny,
+  /** builds a hashmap of active bins in shared mem */
+  HistTypeSmemHash,
+  /** decide at runtime the best algo for the given inputs */
+  HistTypeAuto
+};
+
+/**
+ * @brief Supported types of information criteria
+ */
+enum IC_Type { AIC, AICc, BIC };
+
+};  // end namespace raft::stats
diff --git a/cpp/include/raft/stats/weighted_mean.cuh b/cpp/include/raft/stats/weighted_mean.cuh
index 0e8338fe84..65d1b2c35f 100644
--- a/cpp/include/raft/stats/weighted_mean.cuh
+++ b/cpp/include/raft/stats/weighted_mean.cuh
@@ -19,6 +19,7 @@
 
 #pragma once
 
+#include <raft/core/device_mdspan.hpp>
 #include <raft/stats/detail/weighted_mean.cuh>
 
 namespace raft {
@@ -91,6 +92,90 @@ void colWeightedMean(
 {
   weightedMean(mu, data, weights, D, N, true, false, stream);
 }
+
+/**
+ * @brief Compute the weighted mean of the input matrix with a
+ * vector of weights, along rows or along columns
+ *
+ * @tparam value_t the data type
+ * @tparam idx_t Integer type used for addressing
+ * @tparam layout_t Layout type of the input matrix (raft::row_major or raft::col_major).
+ * @param[in]  handle the raft handle
+ * @param[in]  data the input matrix of size nrows * ncols
+ * @param[in]  weights weight of size ncols if along_rows is true, else of size nrows
+ * @param[out] mu the output mean vector of size nrows if along_rows is true, else of size ncols
+ * @param[in]  along_rows whether to reduce along rows or columns
+ */
+template <typename value_t, typename idx_t, typename layout_t>
+void weighted_mean(const raft::handle_t& handle,
+                   raft::device_matrix_view<const value_t, idx_t, layout_t> data,
+                   raft::device_vector_view<const value_t, idx_t> weights,
+                   raft::device_vector_view<value_t, idx_t> mu,
+                   bool along_rows)
+{
+  constexpr bool is_row_major = std::is_same_v<layout_t, raft::row_major>;
+  constexpr bool is_col_major = std::is_same_v<layout_t, raft::col_major>;
+  static_assert(is_row_major || is_col_major,
+                "weighted_mean: Layout must be either "
+                "raft::row_major or raft::col_major (or one of their aliases)");
+  auto mean_vec_size = along_rows ? data.extent(0) : data.extent(1);
+  auto weight_size   = along_rows ? data.extent(1) : data.extent(0);
+  RAFT_EXPECTS(weights.extent(0) == weight_size,
+               "Size mismatch between weights and expected weight_size");
+  RAFT_EXPECTS(mu.extent(0) == mean_vec_size,
+               "Size mismatch between mu and expected mean_vec_size");
+
+  detail::weightedMean(mu.data_handle(),
+                       data.data_handle(),
+                       weights.data_handle(),
+                       data.extent(1),
+                       data.extent(0),
+                       is_row_major,
+                       along_rows,
+                       handle.get_stream());
+}
+
+/**
+ * @brief Compute the row-wise weighted mean of the input matrix with a
+ * vector of column weights
+ *
+ * @tparam value_t the data type
+ * @tparam idx_t Integer type used for addressing
+ * @tparam layout_t Layout type of the input matrix.
+ * @param[in]  handle the raft handle
+ * @param[in]  data the input matrix of size nrows * ncols
+ * @param[in]  weights weight vector of size ncols
+ * @param[out] mu the output mean vector of size nrows
+ */
+template <typename value_t, typename idx_t, typename layout_t>
+void row_weighted_mean(const raft::handle_t& handle,
+                       raft::device_matrix_view<const value_t, idx_t, layout_t> data,
+                       raft::device_vector_view<const value_t, idx_t> weights,
+                       raft::device_vector_view<value_t, idx_t> mu)
+{
+  weighted_mean(handle, data, weights, mu, true);  // along_rows = true
+}
+
+/**
+ * @brief Compute the column-wise weighted mean of the input matrix with a
+ * vector of row weights
+ *
+ * @tparam value_t the data type
+ * @tparam idx_t Integer type used for addressing
+ * @tparam layout_t Layout type of the input matrix.
+ * @param[in]  handle the raft handle
+ * @param[in]  data the input matrix of size nrows * ncols
+ * @param[in]  weights weight vector of size nrows
+ * @param[out] mu the output mean vector of size ncols
+ */
+template <typename value_t, typename idx_t, typename layout_t>
+void col_weighted_mean(const raft::handle_t& handle,
+                       raft::device_matrix_view<const value_t, idx_t, layout_t> data,
+                       raft::device_vector_view<const value_t, idx_t> weights,
+                       raft::device_vector_view<value_t, idx_t> mu)
+{
+  weighted_mean(handle, data, weights, mu, false);  // along_rows = false
+}
 };  // end namespace stats
 };  // end namespace raft
 
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp
new file mode 100644
index 0000000000..67356785c0
--- /dev/null
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp
@@ -0,0 +1,189 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2019) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+// NOTE: This code is prematurely taken from an example based on 
+// https://github.com/kokkos/mdspan/pull/176
+
+#pragma once
+
+#include "macros.hpp"
+#include "trait_backports.hpp"
+#include "default_accessor.hpp"
+#include "extents.hpp"
+#include <cassert>
+#include <iostream>
+#include <type_traits>
+
+namespace std {
+namespace experimental {
+
+namespace stdex = std::experimental;
+
+
+// Prefer std::assume_aligned if available, as it is in the C++ Standard.
+// Otherwise, use a compiler-specific equivalent if available.
+
+// NOTE (mfh 2022/08/08) BYTE_ALIGNMENT must be unsigned and a power of 2.
+#if defined(__cpp_lib_assume_aligned)
+#  define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) (std::assume_aligned< BYTE_ALIGNMENT >( POINTER ))
+  constexpr char assume_aligned_method[] = "std::assume_aligned";
+#elif defined(__ICL)
+#  define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) POINTER
+  constexpr char assume_aligned_method[] = "(none)";
+#elif defined(__ICC)
+#  define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) POINTER
+  constexpr char assume_aligned_method[] = "(none)";
+#elif defined(__clang__)
+#  define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) POINTER
+  constexpr char assume_aligned_method[] = "(none)";
+#elif defined(__GNUC__)
+  // __builtin_assume_aligned returns void*
+#  define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) reinterpret_cast< ELEMENT_TYPE* >(__builtin_assume_aligned( POINTER, BYTE_ALIGNMENT ))
+  constexpr char assume_aligned_method[] = "__builtin_assume_aligned";
+#else
+#  define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) POINTER
+  constexpr char assume_aligned_method[] = "(none)";
+#endif
+
+// Some compilers other than Clang or GCC like to define __clang__ or __GNUC__.
+// Thus, we order the tests from most to least specific. No trailing ';': the use site adds it.
+#if defined(__ICL)
+#  define _MDSPAN_ALIGN_VALUE_ATTRIBUTE( BYTE_ALIGNMENT ) __declspec(align_value( BYTE_ALIGNMENT ))
+  constexpr char align_attribute_method[] = "__declspec(align_value(BYTE_ALIGNMENT))";
+#elif defined(__ICC)
+#  define _MDSPAN_ALIGN_VALUE_ATTRIBUTE( BYTE_ALIGNMENT ) __attribute__((align_value( BYTE_ALIGNMENT )))
+  constexpr char align_attribute_method[] = "__attribute__((align_value(BYTE_ALIGNMENT)))";
+#elif defined(__clang__)
+#  define _MDSPAN_ALIGN_VALUE_ATTRIBUTE( BYTE_ALIGNMENT ) __attribute__((align_value( BYTE_ALIGNMENT )))
+  constexpr char align_attribute_method[] = "__attribute__((align_value(BYTE_ALIGNMENT)))";
+#else
+#  define _MDSPAN_ALIGN_VALUE_ATTRIBUTE( BYTE_ALIGNMENT )
+  constexpr char align_attribute_method[] = "(none)";
+#endif
+
+constexpr bool
+is_nonzero_power_of_two(const std::size_t x)
+{
+// Just checking __cpp_lib_int_pow2 isn't enough for some GCC versions.
+// The <bit> header exists, but std::has_single_bit does not.
+#if defined(__cpp_lib_int_pow2) && __cplusplus >= 202002L
+  return std::has_single_bit(x);
+#else
+  return x != 0 && (x & (x - 1)) == 0;
+#endif
+}
+
+template<class ElementType>
+constexpr bool
+valid_byte_alignment(const std::size_t byte_alignment)
+{
+  return is_nonzero_power_of_two(byte_alignment) && byte_alignment >= alignof(ElementType);
+}
+
+// We define aligned_pointer_t through a struct
+// so we can check whether the byte alignment is valid.
+// This makes it impossible to use the alias
+// with an invalid byte alignment.
+template<class ElementType, std::size_t byte_alignment>
+struct aligned_pointer {
+  static_assert(valid_byte_alignment<ElementType>(byte_alignment),
+		"byte_alignment must be a power of two no less than "
+		"the minimum required alignment of ElementType.");
+  using type = ElementType* _MDSPAN_ALIGN_VALUE_ATTRIBUTE( byte_alignment );
+};
+
+
+template<class ElementType, std::size_t byte_alignment>
+using aligned_pointer_t = typename aligned_pointer<ElementType, byte_alignment>::type;
+
+template<class ElementType, std::size_t byte_alignment>
+aligned_pointer_t<ElementType, byte_alignment>
+bless(ElementType* ptr, std::integral_constant<std::size_t, byte_alignment> /* ba */ )
+{
+  return _MDSPAN_ASSUME_ALIGNED( ElementType, ptr, byte_alignment );
+}
+
+
+template<class ElementType, std::size_t byte_alignment>
+struct aligned_accessor {
+  using offset_policy = stdex::default_accessor<ElementType>;
+  using element_type = ElementType;
+  using reference = ElementType&;
+  using data_handle_type = aligned_pointer_t<ElementType, byte_alignment>;
+
+  constexpr aligned_accessor() noexcept = default;
+
+  MDSPAN_TEMPLATE_REQUIRES(
+    class OtherElementType,
+    std::size_t other_byte_alignment,
+    /* requires */ (std::is_convertible<OtherElementType(*)[], element_type(*)[]>::value && other_byte_alignment == byte_alignment)
+    )
+  constexpr aligned_accessor(aligned_accessor<OtherElementType, other_byte_alignment>) noexcept {}
+
+  constexpr reference access(data_handle_type p, size_t i) const noexcept {
+    // This may declare alignment twice, depending on
+    // if we have an attribute for marking pointer types.
+    return _MDSPAN_ASSUME_ALIGNED( ElementType, p, byte_alignment )[i];
+  }
+
+  constexpr typename offset_policy::data_handle_type
+  offset(data_handle_type p, size_t i) const noexcept {
+    return p + i;
+  }
+};
+
+template<class ElementType>
+struct delete_raw {
+  void operator()(ElementType* p) const {
+    if (p != nullptr) {
+      // Releases with std::free, so `p` must come from an allocator paired with it.
+      // NOTE(review): no allocation helper is defined in this header and <cstdlib>
+      // is not included here directly - confirm both at the point of use.
+      std::free(p);
+    }
+  }
+};
+
+}  // end namespace experimental
+}  // end namespace std
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp
new file mode 100644
index 0000000000..cd9c9c19bf
--- /dev/null
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp
@@ -0,0 +1,787 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2019) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+// NOTE: This code is prematurely taken from https://github.com/kokkos/mdspan/pull/180
+// and matches the requirements described in https://github.com/ORNL/cpp-proposals-pub/pull/296.
+// Some parts (such as the submdspan integration) are missing.
+// EDIT: the meaning of the template argument 'padding_stride' was changed from a
+// fixed stride to a padding alignment, so that dimensions > padding_stride are padded
+// to multiples of 'padding_stride'.
+
+#pragma once
+
+#include "macros.hpp"
+#include "trait_backports.hpp"
+#include "extents.hpp"
+#include "layout_left.hpp"
+#include "layout_right.hpp"
+#include <cassert>
+#include <iostream>
+#include <type_traits>
+
+namespace std {
+namespace experimental {
+
+namespace stdex = std::experimental;
+
+namespace details {
+
+  // offset_index_sequence idea comes from "offset_sequence" here:
+  // https://devblogs.microsoft.com/oldnewthing/20200625-00/?p=103903
+  //
+  // offset_index_sequence adds N to each element of the given IndexSequence.
+  // We can't just template on the parameter pack of indices directly;
+  // the pack needs to be contained in some type.
+  // We choose index_sequence because it stores no run-time data.
+  template<std::size_t N, class IndexSequence> struct offset_index_sequence;
+
+  template<std::size_t N, std::size_t... Indices>
+  struct offset_index_sequence<N, std::index_sequence<Indices...>>
+  {
+    using type = std::index_sequence<(Indices + N)...>;
+  };
+
+  template<std::size_t N, typename IndexSequence>
+  using offset_index_sequence_t = typename offset_index_sequence<N, IndexSequence>::type;
+
+  static_assert(std::is_same<
+    offset_index_sequence_t<3, std::make_index_sequence<4>>,
+    std::index_sequence<3, 4, 5, 6>>::value,
+    "offset_index_sequence defined incorrectly." );
+
+  // iota_index_sequence defines the half-open sequence
+  // begin, begin+1, begin+2, ..., end-1.
+  // If end == begin, then the sequence is empty (we permit this).
+  //
+  // Defining the struct first, rather than going straight to the type alias,
+  // lets us check the template arguments.
+  template<std::size_t begin, std::size_t end>
+  struct iota_index_sequence {
+    static_assert(end >= begin, "end must be >= begin.");
+    using type =
+      offset_index_sequence_t< begin, std::make_index_sequence<end - begin> >;
+  };
+
+  // iota_index_sequence_t is like make_index_sequence,
+  // except that it starts with begin instead of 0.
+  template<std::size_t begin, std::size_t end>
+  using iota_index_sequence_t = typename iota_index_sequence<begin, end>::type;
+
+  static_assert(std::is_same<
+		iota_index_sequence_t<3, 6>,
+		std::index_sequence<3, 4, 5>>::value,
+		"iota_index_sequence defined incorrectly." );
+
+  static_assert(std::is_same<
+		iota_index_sequence_t<3, 3>,
+		std::index_sequence<>>::value,
+		"iota_index_sequence defined incorrectly." );
+
+  static_assert(std::is_same<
+		iota_index_sequence_t<3, 4>,
+		std::index_sequence<3>>::value,
+		"iota_index_sequence defined incorrectly." );
+
+  // ceil(a / b); expects a >= 0 and b > 0. Marked like every other helper in this file.
+  template <typename IndexType>
+  MDSPAN_INLINE_FUNCTION constexpr IndexType ceildiv(IndexType a, IndexType b)
+  {
+    return (a + b - 1) / b;
+  }
+
+  // Rounds a up to the nearest multiple of b (same preconditions as ceildiv).
+  template <typename IndexType>
+  MDSPAN_INLINE_FUNCTION constexpr IndexType alignTo(IndexType a, IndexType b)
+  { return ceildiv(a, b) * b; }
+
+} // namespace details
+
+// layout_padded_left implementation
+
+namespace details {
+   
+
+  // The *_helper functions work around not having C++20
+  // templated lambdas: []<size_t... TrailingIndices>{} .
+
+  // The third argument should always be
+  // iota_index_sequence_t<1, ReturnExtents::rank()>.
+  template<class ReturnExtents,
+	   std::size_t UnpaddedExtent,
+	   class InnerExtents,
+	   std::size_t... TrailingIndices>
+  MDSPAN_INLINE_FUNCTION constexpr ReturnExtents
+  layout_left_extents_helper(const stdex::extents<typename InnerExtents::index_type, UnpaddedExtent>& unpadded_extent,
+			     const InnerExtents& inner_extents,
+			     std::index_sequence<TrailingIndices...>)
+  {
+    static_assert(sizeof...(TrailingIndices) + 1 == ReturnExtents::rank(),
+		  "sizeof...(TrailingIndices) + 1 != ReturnExtents::rank()");
+    static_assert(InnerExtents::rank() == ReturnExtents::rank(),
+		  "InnerExtents::rank() != ReturnExtents::rank()");
+    using index_type = typename ReturnExtents::index_type;
+    return ReturnExtents{
+      unpadded_extent.extent(0),
+      index_type(inner_extents.extent(TrailingIndices))...
+    };
+  }
+
+  // The third argument should always be
+  // iota_index_sequence_t<0, ReturnExtents::rank() - 1>.
+  template<class ReturnExtents,
+	   std::size_t UnpaddedExtent,
+	   class InnerExtents,
+	   std::size_t... LeadingIndices>
+  MDSPAN_INLINE_FUNCTION constexpr ReturnExtents
+  layout_right_extents_helper(const InnerExtents& inner_extents,
+			      const stdex::extents<typename InnerExtents::index_type, UnpaddedExtent>& unpadded_extent,
+			      std::index_sequence<LeadingIndices...>)
+  {
+    static_assert(sizeof...(LeadingIndices) + 1 == ReturnExtents::rank(),
+		  "sizeof...(LeadingIndices) + 1 != ReturnExtents::rank()");
+    static_assert(InnerExtents::rank() == ReturnExtents::rank(),
+		  "InnerExtents::rank() != ReturnExtents::rank()");
+    using index_type = typename ReturnExtents::index_type;
+    return ReturnExtents{
+      index_type(inner_extents.extent(LeadingIndices))...,
+      unpadded_extent.extent(0)
+    };
+  }
+
+  template<class ReturnExtents,
+	   std::size_t UnpaddedExtent,
+	   class IndexType,
+	   std::size_t... InnerExtents>
+  MDSPAN_INLINE_FUNCTION constexpr ReturnExtents
+  layout_left_extents(const stdex::extents<IndexType, UnpaddedExtent>& unpadded_extent,
+		      const stdex::extents<IndexType, InnerExtents...>& inner_extents)
+  {
+    return layout_left_extents_helper<ReturnExtents>(
+      unpadded_extent,
+      inner_extents,
+      details::iota_index_sequence_t<1, ReturnExtents::rank()>{}
+    );
+  }
+
+  // Rank-0 unpadded_extent means rank-0 input,
+  // but the latter turns out not to matter here.
+
+  template<class ReturnExtents,
+	   class IndexType,
+	   std::size_t... InnerExtents>
+  MDSPAN_INLINE_FUNCTION constexpr ReturnExtents
+  layout_left_extents(const stdex::extents<IndexType>& /* unpadded_extent */ ,
+		      const stdex::extents<IndexType, InnerExtents...>& inner_extents)
+  {
+    return inner_extents;
+  }
+
+  template<class ReturnExtents,
+	   std::size_t UnpaddedExtent,
+	   class IndexType,
+	   std::size_t... InnerExtents>
+  MDSPAN_INLINE_FUNCTION constexpr ReturnExtents
+  layout_right_extents(const stdex::extents<IndexType, InnerExtents...>& inner_extents,
+		       const stdex::extents<IndexType, UnpaddedExtent>& unpadded_extent)
+  {
+    // If rank() is zero, size_t(-1) would be a very large upper bound.
+    static_assert(ReturnExtents::rank() != 0,
+		  "ReturnExtents::rank() must not be 0");
+    return layout_right_extents_helper<ReturnExtents>(
+       inner_extents,
+       unpadded_extent,
+       details::iota_index_sequence_t<0, ReturnExtents::rank() - 1>{}
+    );
+  }
+
+  // Rank-0 unpadded_extent means rank-0 input,
+  // but the latter turns out not to matter here.
+
+  template<class ReturnExtents,
+	   class IndexType,
+	   std::size_t... InnerExtents>
+  MDSPAN_INLINE_FUNCTION constexpr ReturnExtents
+  layout_right_extents(const stdex::extents<IndexType, InnerExtents...>& inner_extents,
+		       const stdex::extents<IndexType>& /* unpadded_extent */ )
+  {
+    return inner_extents;
+  }
+
+  template<
+    class InputExtentsType,
+    std::size_t PaddingExtent,
+    std::size_t ... Indices
+  >
+  MDSPAN_INLINE_FUNCTION constexpr auto
+  pad_extents_left_helper(const InputExtentsType& input,
+			  const stdex::extents<typename InputExtentsType::index_type, PaddingExtent>& padding,
+			  std::index_sequence<Indices...>)
+  {
+    // NOTE (mfh 2022/09/04) This can be if constexpr,
+    // if the compiler supports it.
+    if /* constexpr */ (PaddingExtent == stdex::dynamic_extent) {
+      assert(padding.extent(0) != stdex::dynamic_extent);
+    }
+    using input_type = std::remove_cv_t<std::remove_reference_t<InputExtentsType>>;
+    using index_type = typename input_type::index_type;
+    constexpr std::size_t rank = input_type::rank();
+    static_assert(sizeof...(Indices) == std::size_t(rank - 1),
+		  "Indices pack has the wrong size.");
+    using return_type = stdex::extents<
+      index_type,
+      stdex::dynamic_extent,
+      input_type::static_extent(Indices)...
+    >;
+    return return_type{
+      index_type(details::alignTo(input.extent(0), padding.extent(0))),
+      input.extent(Indices)...
+    };
+  }
+
+  template<
+    class InputExtentsType,
+    std::size_t PaddingExtent,
+    std::size_t ... Indices
+  >
+  MDSPAN_INLINE_FUNCTION constexpr auto
+  pad_extents_right_helper(const InputExtentsType& input,
+			   const stdex::extents<typename InputExtentsType::index_type, PaddingExtent>& padding,
+			   std::index_sequence<Indices...>)
+  {
+    // NOTE (mfh 2022/09/04) This can be if constexpr,
+    // if the compiler supports it.
+    if /* constexpr */ (PaddingExtent == stdex::dynamic_extent) {
+      assert(padding.extent(0) != stdex::dynamic_extent);
+    }
+    using input_type = std::remove_cv_t<std::remove_reference_t<InputExtentsType>>;
+    using index_type = typename input_type::index_type;
+    constexpr std::size_t rank = input_type::rank();
+    static_assert(sizeof...(Indices) == std::size_t(rank - 1),
+		  "Indices pack has the wrong size.");
+
+    using return_type = stdex::extents<
+      index_type,
+      input_type::static_extent(Indices)...,
+      stdex::dynamic_extent
+    >;
+    return return_type{
+      input.extent(Indices)...,
+      index_type(details::alignTo(input.extent(rank - 1), padding.extent(0)))
+    };
+  }
+
+  // Rank-0 and rank-1 mdspan don't need extra padding from their layout.
+  // They rely on an "aligned_accessor" and on the data_handle's alignment.
+
+  MDSPAN_TEMPLATE_REQUIRES(
+    class IndexType,
+    std::size_t PaddingExtent,
+    std::size_t ... InputExtents,
+    /* requires */ (sizeof...(InputExtents) <= std::size_t(1))
+  )
+  MDSPAN_INLINE_FUNCTION constexpr auto
+  pad_extents_left(const stdex::extents<IndexType, InputExtents...>& input,
+		   const stdex::extents<IndexType, PaddingExtent> /* padding */ )
+  {
+    return input;
+  }
+
+  MDSPAN_TEMPLATE_REQUIRES(
+    class IndexType,
+    std::size_t PaddingExtent,
+    std::size_t ... InputExtents,
+    /* requires */ (sizeof...(InputExtents) <= std::size_t(1))
+  )
+  MDSPAN_INLINE_FUNCTION constexpr auto
+  pad_extents_right(const stdex::extents<IndexType, InputExtents...>& input,
+		    const stdex::extents<IndexType, PaddingExtent> /* padding */ )
+  {
+    return input;
+  }
+
+  // rank > 1 case follows.
+
+  MDSPAN_TEMPLATE_REQUIRES(
+    class IndexType,
+    std::size_t PaddingExtent,
+    std::size_t ... InputExtents,
+    /* requires */ (sizeof...(InputExtents) > std::size_t(1))
+  )
+  MDSPAN_INLINE_FUNCTION constexpr auto
+  pad_extents_left(const stdex::extents<IndexType, InputExtents...>& input,
+		   const stdex::extents<IndexType, PaddingExtent> padding)
+  {
+    constexpr std::size_t rank = sizeof...(InputExtents);
+    return details::pad_extents_left_helper
+      (input, padding, details::iota_index_sequence_t<1, rank>{});
+  }
+
+  MDSPAN_TEMPLATE_REQUIRES(
+    class IndexType,
+    std::size_t PaddingExtent,
+    std::size_t ... InputExtents,
+    /* requires */ (sizeof...(InputExtents) > std::size_t(1))
+  )
+  MDSPAN_INLINE_FUNCTION constexpr auto
+  pad_extents_right(const stdex::extents<IndexType, InputExtents...>& input,
+		    const stdex::extents<IndexType, PaddingExtent> padding)
+  {
+    constexpr std::size_t rank = sizeof...(InputExtents);
+    return details::pad_extents_right_helper
+      (input, padding, details::iota_index_sequence_t<0, rank - 1>{});
+  }
+
+  MDSPAN_TEMPLATE_REQUIRES(
+    class IndexType,
+    std::size_t ... InputExtents,
+    /* requires */ (sizeof...(InputExtents) != std::size_t(0))
+  )
+  MDSPAN_INLINE_FUNCTION constexpr auto
+  unpadded_extent_left(const stdex::extents<IndexType, InputExtents...>& input)
+  {
+    using input_type = stdex::extents<IndexType, InputExtents...>;
+    return stdex::extents<IndexType, input_type::static_extent(0)>{input.extent(0)};
+  }
+
+  MDSPAN_TEMPLATE_REQUIRES(
+    class IndexType,
+    std::size_t ... InputExtents,
+    /* requires */ (sizeof...(InputExtents) != std::size_t(0))
+  )
+  MDSPAN_INLINE_FUNCTION constexpr auto
+  unpadded_extent_right(const stdex::extents<IndexType, InputExtents...>& input)
+  {
+    using input_type = stdex::extents<IndexType, InputExtents...>;
+    const auto rank = input_type::rank();
+    return stdex::extents<IndexType, input_type::static_extent(rank - 1)>{input.extent(rank - 1)};
+  }
+
+  template<class IndexType>
+  MDSPAN_INLINE_FUNCTION constexpr auto
+  unpadded_extent_left(const stdex::extents<IndexType>& /* input */ )
+  {
+    return stdex::extents<IndexType>{};
+  }
+
+  template<class IndexType>
+  MDSPAN_INLINE_FUNCTION constexpr auto
+  unpadded_extent_right(const stdex::extents<IndexType>& /* input */ )
+  {
+    return stdex::extents<IndexType>{};
+  }
+
+  // Helper functions to work around C++14's lack of "if constexpr."
+
+  template<class PaddingExtentsType,
+	   class InnerMappingType,
+	   std::size_t Rank>
+  MDSPAN_INLINE_FUNCTION constexpr PaddingExtentsType
+  left_padding_extents(const InnerMappingType& inner_mapping,
+		       std::integral_constant<std::size_t, Rank> /* rank */ )
+  {
+    return PaddingExtentsType{inner_mapping.extent(0)};
+  }
+
+  template<class PaddingExtentsType,
+	   class InnerMappingType>
+  MDSPAN_INLINE_FUNCTION constexpr PaddingExtentsType
+  left_padding_extents(const InnerMappingType& /* inner_mapping */ ,
+		       std::integral_constant<std::size_t, 0> /* rank */ )
+  {
+    return PaddingExtentsType{};
+  }
+
+  template<class PaddingExtentsType,
+	   class InnerMappingType,
+           std::size_t Rank>
+  MDSPAN_INLINE_FUNCTION constexpr PaddingExtentsType
+  right_padding_extents(const InnerMappingType& inner_mapping,
+			std::integral_constant<std::size_t, Rank> /* rank */ )
+  {
+    return PaddingExtentsType{inner_mapping.extent(Rank - 1)};
+  }
+
+  template<class PaddingExtentsType,
+	   class InnerMappingType>
+  MDSPAN_INLINE_FUNCTION constexpr PaddingExtentsType
+  right_padding_extents(const InnerMappingType& /* inner_mapping */ ,
+		       std::integral_constant<std::size_t, 0> /* rank */ )
+  {
+    return PaddingExtentsType{};
+  }
+
+} // namespace details
+
+
+
+// TODO (mfh 2022/08/30) Private inheritance from layout_left::mapping
+// resp. layout_right::mapping would reduce inlining depth.
+
+// layout_left_padded is like layout_left,
+// except that stride(0) == 1 always,
+// and the leftmost extent may be padded
+// (so that stride(1) could possibly be greater than extent(0)).
+//
+// This layout exists for two reasons:
+//
+// 1. Appropriate choice of padding, plus use of overaligned memory,
+//    can ensure any desired power-of-two overalignment of the
+//    beginning of each contiguous segment of elements in an mdspan.
+//    This is useful for hardware that optimizes for overaligned
+//    access.
+//
+// 2. For rank-2 mdspan, this is exactly the layout supported by the
+//    BLAS and LAPACK (where the "leading dimension" of the matrix
+//    (LDA), i.e., the stride, is greater than or equal to the number
+//    of rows).
+//
+// The padding can be either a compile-time value or a run-time value.
+// It is a template parameter of layout_left_padded (the "tag type"),
+// and NOT of the mapping, because mdspan requires that the mapping be
+// a metafunction of the tag type and the extents specialization type.
+template<std::size_t padding_stride = stdex::dynamic_extent>
+struct layout_left_padded {
+  static constexpr size_t padding = padding_stride;
+  template <class Extents>
+  class mapping {
+  public:
+    using extents_type = Extents;
+    using index_type = typename extents_type::index_type;
+    using size_type = typename extents_type::size_type;
+    using rank_type = typename extents_type::rank_type;
+    using layout_type = layout_left_padded<padding_stride>;
+
+  private:
+    using padding_extents_type =
+      stdex::extents<index_type, padding_stride>;
+    using inner_layout_type = stdex::layout_left;
+    using inner_extents_type = decltype(
+      details::pad_extents_left(
+	std::declval<Extents>(),
+	std::declval<padding_extents_type>()
+      )
+    );
+    using inner_mapping_type =
+      typename inner_layout_type::template mapping<inner_extents_type>;
+    using unpadded_extent_type =
+      decltype(details::unpadded_extent_left(std::declval<extents_type>()));
+
+    inner_mapping_type inner_mapping_;
+    unpadded_extent_type unpadded_extent_;
+
+    padding_extents_type padding_extents() const {
+      return details::left_padding_extents<padding_extents_type>(
+        inner_mapping_,
+	std::integral_constant<std::size_t, extents_type::rank()>{});
+    }
+
+  public:
+    // mapping constructor that takes ONLY an extents_type.
+    //
+    // This constructor makes it possible to construct an mdspan
+    // from a pointer and extents, since that requires that
+    // the mapping be constructible from extents alone.
+    MDSPAN_INLINE_FUNCTION constexpr
+    mapping(const extents_type& ext) :
+      inner_mapping_(details::pad_extents_left(
+        ext,
+	padding_extents_type{padding_stride})),
+      unpadded_extent_(details::unpadded_extent_left(ext))
+    {}
+
+    // mapping constructor that takes an extents_type,
+    // AND an integral padding_value.
+    //
+    // This constructor always exists, even if padding is known at
+    // compile time -- just like the extents constructor lets you pass
+    // in all rank() extents, even if some of them are known at
+    // compile time.
+    template<class Size>
+    MDSPAN_INLINE_FUNCTION constexpr
+    mapping(const extents_type& ext,
+	    Size padding_value,
+	    std::enable_if_t<
+	      std::is_convertible<Size, index_type>::value &&
+              std::is_nothrow_constructible<index_type, Size>::value
+	    >* = nullptr) :
+      inner_mapping_(details::pad_extents_left(ext, padding_extents_type{padding_value})),
+      unpadded_extent_(details::unpadded_extent_left(ext))
+    {
+      // We don't have to check padding_value here, because the
+      // padding_extents_type constructor already has a precondition.
+    }
+
+    // Pass in the padding as an extents object.
+    MDSPAN_INLINE_FUNCTION constexpr
+    mapping(const extents_type& ext,
+	    const stdex::extents<index_type, padding_stride>& padding_extents) :
+      inner_mapping_(details::pad_extents_left(ext, padding_extents)),
+      unpadded_extent_(details::unpadded_extent_left(ext))
+    {}
+
+    // FIXME (mfh 2022/09/28) Converting constructor taking
+    // layout_right_padded<other_padding_stride>::mapping<OtherExtents>
+    // is in the proposal, but missing here.
+
+    // layout_stride::mapping deliberately only defines the copy
+    // constructor and copy assignment operator, not the move
+    // constructor or move assignment operator.  This is fine because
+    // all the storage is std::array-like; there's no advantage to
+    // move construction or move assignment.  We imitate this.
+    MDSPAN_INLINE_FUNCTION_DEFAULTED
+    constexpr mapping(const mapping&) noexcept = default;
+    MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED
+    mapping& operator=(const mapping&) noexcept = default;
+
+    MDSPAN_INLINE_FUNCTION
+    constexpr extents_type extents() const noexcept
+    {
+      return details::layout_left_extents<extents_type>(
+	unpadded_extent_,
+	inner_mapping_.extents()
+      );
+    }
+
+    MDSPAN_INLINE_FUNCTION
+    constexpr std::array<index_type, extents_type::rank()>
+    strides() const noexcept
+    {
+      return inner_mapping_.strides();
+    }
+
+    MDSPAN_INLINE_FUNCTION
+    constexpr index_type required_span_size() const noexcept
+    {
+      return inner_mapping_.required_span_size();
+    }
+
+    MDSPAN_TEMPLATE_REQUIRES(
+      class... Indices,
+      /* requires */ (sizeof...(Indices) == Extents::rank() &&
+        _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) /*&& ...*/ ) &&
+	_MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices) /*&& ...*/)
+      )
+    )
+    MDSPAN_INLINE_FUNCTION
+    constexpr size_t operator()(Indices... idxs) const noexcept {
+      // TODO (mfh 2022/08/30) in debug mode, check precondition before forwarding to inner mapping.
+      return inner_mapping_(std::forward<Indices>(idxs)...);
+    }
+
+    MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return true; }
+    MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept {
+      return extents_type::rank() == 0 ? true :
+	(extents_type::static_extent(0) != stdex::dynamic_extent &&
+	 extents_type::static_extent(0) == unpadded_extent_type::static_extent(0));
+    }
+    MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; }
+
+    MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; }
+    // Exhaustive iff extent 0 of the inner mapping equals the unpadded extent 0 (rank 0: always); read via extents().
+    MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept {
+      return extents_type::rank() == 0 || inner_mapping_.extents().extent(0) == unpadded_extent_.extent(0);
+    }
+    MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; }
+
+    MDSPAN_INLINE_FUNCTION
+    constexpr index_type stride(rank_type r) const noexcept {
+      return inner_mapping_.stride(r);
+    }
+  };
+};
+
+template<std::size_t padding_stride = stdex::dynamic_extent>
+struct layout_right_padded {
+  static constexpr size_t padding = padding_stride;
+  template <class Extents>
+  class mapping {
+  public:
+    using extents_type = Extents;
+    using index_type = typename extents_type::index_type;
+    using size_type = typename extents_type::size_type;
+    using rank_type = typename extents_type::rank_type;
+    using layout_type = layout_right_padded<padding_stride>;
+
+  private:
+    using padding_extents_type =
+      stdex::extents<index_type, padding_stride>;
+    using inner_layout_type = stdex::layout_right;
+    using inner_extents_type = decltype(
+      details::pad_extents_right(
+        std::declval<Extents>(),
+        std::declval<padding_extents_type>()
+      )
+    );
+    using inner_mapping_type =
+      typename inner_layout_type::template mapping<inner_extents_type>;
+    using unpadded_extent_type =
+      decltype(details::unpadded_extent_right(std::declval<extents_type>()));
+
+    inner_mapping_type inner_mapping_;
+    unpadded_extent_type unpadded_extent_;
+
+    padding_extents_type padding_extents() const {
+      return details::right_padding_extents<padding_extents_type>(
+        inner_mapping_,
+	std::integral_constant<std::size_t, extents_type::rank()>{});
+    }
+
+  public:
+    // mapping constructor that takes ONLY an extents_type.
+    //
+    // This constructor makes it possible to construct an mdspan
+    // from a pointer and extents, since that requires that
+    // the mapping be constructible from extents alone.
+    MDSPAN_INLINE_FUNCTION constexpr
+    mapping(const extents_type& ext) :
+      inner_mapping_(details::pad_extents_right(
+        ext,
+	padding_extents_type{padding_stride})),
+      unpadded_extent_(details::unpadded_extent_right(ext))
+    {}
+
+    // mapping constructor that takes an extents_type,
+    // AND an integral padding_value.
+    //
+    // This constructor always exists, even if padding is known at
+    // compile time -- just like the extents constructor lets you pass
+    // in all rank() extents, even if some of them are known at
+    // compile time.
+    template<class Size>
+    MDSPAN_INLINE_FUNCTION constexpr
+    mapping(const extents_type& ext,
+	    Size padding_value,
+	    std::enable_if_t<
+	      std::is_convertible<Size, index_type>::value &&
+	      std::is_nothrow_constructible<index_type, Size>::value
+	    >* = nullptr) :
+      inner_mapping_(details::pad_extents_right(ext, padding_extents_type{padding_value})),
+      unpadded_extent_(details::unpadded_extent_right(ext))
+    {
+      // We don't have to check padding_value here, because the
+      // padding_extents_type constructor already has a precondition.
+    }
+
+    // Pass in the padding as an extents object.
+    MDSPAN_INLINE_FUNCTION constexpr
+    mapping(const extents_type& ext,
+	    const stdex::extents<index_type, padding_stride>& padding_extents) :
+      inner_mapping_(details::pad_extents_right(ext, padding_extents)),
+      unpadded_extent_(details::unpadded_extent_right(ext))
+    {}
+
+    // FIXME (mfh 2022/09/28) The converting constructor taking
+    // layout_left_padded<other_padding_stride>::mapping<OtherExtents>
+    // is in the proposal (missing other_padding_stride in R0),
+    // but missing here.
+
+    // layout_stride::mapping deliberately only defines the copy
+    // constructor and copy assignment operator, not the move
+    // constructor or move assignment operator.  This is fine because
+    // all the storage is std::array-like; there's no advantage to
+    // move construction or move assignment.  We imitate this.
+    MDSPAN_INLINE_FUNCTION_DEFAULTED
+    constexpr mapping(const mapping&) noexcept = default;
+    MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED
+    mapping& operator=(const mapping&) noexcept = default;
+
+    MDSPAN_INLINE_FUNCTION
+    constexpr extents_type extents() const noexcept
+    {
+      return details::layout_right_extents<extents_type>(
+	inner_mapping_.extents(),
+	unpadded_extent_
+      );
+    }
+
+    MDSPAN_INLINE_FUNCTION
+    constexpr std::array<index_type, extents_type::rank()>
+    strides() const noexcept
+    {
+      return inner_mapping_.strides();
+    }
+
+    MDSPAN_INLINE_FUNCTION
+    constexpr index_type required_span_size() const noexcept
+    {
+      return inner_mapping_.required_span_size();
+    }
+
+    MDSPAN_TEMPLATE_REQUIRES(
+      class... Indices,
+      /* requires */ (sizeof...(Indices) == Extents::rank() &&
+        _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) /*&& ...*/ ) &&
+	_MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices) /*&& ...*/)
+      )
+    )
+    MDSPAN_INLINE_FUNCTION
+    constexpr size_t operator()(Indices... idxs) const noexcept {
+      // TODO (mfh 2022/08/30) in debug mode, check precondition before forwarding to inner mapping.
+      return inner_mapping_(std::forward<Indices>(idxs)...);
+    }
+
+    MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return true; }
+    MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept {
+      return extents_type::rank() == 0 ? true :
+	(extents_type::static_extent(Extents::rank() - 1) != stdex::dynamic_extent &&
+	 extents_type::static_extent(Extents::rank() - 1) == unpadded_extent_type::static_extent(0));
+    }
+    MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; }
+
+    MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; }
+    // Exhaustive iff the last extent of the inner mapping equals the unpadded one (rank 0: always); read via extents().
+    MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept {
+      return extents_type::rank() == 0 || inner_mapping_.extents().extent(Extents::rank() - 1) == unpadded_extent_.extent(0);
+    }
+    MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; }
+
+    MDSPAN_INLINE_FUNCTION
+    constexpr index_type stride(rank_type r) const noexcept {
+      return inner_mapping_.stride(r);
+    }
+  };
+};
+
+}  // end namespace experimental
+}  // end namespace std
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/mdspan b/cpp/include/raft/thirdparty/mdspan/include/experimental/mdspan
index ca6f6b8686..c5fac4c0bd 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/mdspan
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/mdspan
@@ -44,6 +44,7 @@
 #pragma once
 
 #include "__p0009_bits/default_accessor.hpp"
+#include "__p0009_bits/aligned_accessor.hpp"
 #include "__p0009_bits/full_extent_t.hpp"
 #include "__p0009_bits/mdspan.hpp"
 #include "__p0009_bits/dynamic_extent.hpp"
@@ -51,6 +52,7 @@
 #include "__p0009_bits/layout_stride.hpp"
 #include "__p0009_bits/layout_left.hpp"
 #include "__p0009_bits/layout_right.hpp"
+#include "__p0009_bits/layout_padded.hpp"
 #include "__p0009_bits/macros.hpp"
 #include "__p0009_bits/static_array.hpp"
 #include "__p0009_bits/submdspan.hpp"
diff --git a/cpp/include/raft/util/cache.cuh b/cpp/include/raft/util/cache.cuh
new file mode 100644
index 0000000000..8394ce83b8
--- /dev/null
+++ b/cpp/include/raft/util/cache.cuh
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cub/cub.cuh>
+
+#include <raft/core/interruptible.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/util/cache_util.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <cstddef>
+
+namespace raft::cache {
+
+/**
+ * @brief Associative cache with least recently used replacement policy.
+ *
+ * SW managed cache in device memory, for ML algos where we can trade memory
+ * access for computation. The two main functions of this class are the
+ * management of cache indices, and methods to retrieve/store data using the
+ * cache indices.
+ *
+ * The index management can be considered as a hash map<int, int>, where the int
+ * keys are the original vector indices that we want to store, and the values are
+ * the cache location of these vectors. The keys are hashed into a bucket
+ * whose size equals the associativity. These are the cache sets. If a cache
+ * set is full, then new indices are stored by replacing the oldest entries.
+ *
+ * Using this index mapping we implement methods to store and retrieve data from
+ * the cache buffer, where a unit of data that we are storing is math_t[n_vec].
+ * For example in SVM we store full columns of the kernel matrix at each cache
+ * entry.
+ *
+ * Note: we should have a look if the index management could be simplified using
+ * concurrent_unordered_map.cuh from cudf. See Issue #914.
+ *
+ * Example usage:
+ * @code{.cpp}
+ *
+ * // An expensive calculation that we want to accelerate with caching:
+ * // we have n keys, and for each key we generate a vector with m elements.
+ * // The keys and the output values are stored in GPU memory.
+ * void calc(int *key, int n, int m, float *out, cudaStream_t stream) {
+ *   for (k=0; k<n; k++) {
+ *     // use key[k] to generate out[i + m*k],  where i=0..m-1
+ *   }
+ * }
+ *
+ * // We assume that our ML algo repeatedly calls calc, and the set of keys have
+ * // an overlap. We will use the cache to avoid repeated calculations.
+ *
+ * // Assume we have a cudaStream_t stream. Each cache entry stores m elements.
+ * Cache<float> cache(stream, m);
+ *
+ * // A buffer that we will reuse to store the cache indices.
+ * rmm::device_uvector<int> cache_idx(n, stream);
+ *
+ * void cached_calc(int *key, int n, int m, float *out, cudaStream_t stream) {
+ *   int n_cached = 0;
+ *
+ *   cache.GetCacheIdxPartitioned(key, n, cache_idx.data(), &n_cached,
+ *                                stream);
+ *
+ *   // Note: GetCacheIdxPartitioned has reordered the keys so that
+ *   // key[0..n_cached-1] are the keys already in the cache.
+ *   // We collect the corresponding values
+ *   cache.GetVecs(cache_idx.data(), n_cached, out, stream);
+ *
+ *   // Calculate the elements not in the cache
+ *   int non_cached = n - n_cached;
+ *   if (non_cached > 0) {
+ *     int *key_new = key + n_cached;
+ *     int *cache_idx_new = cache_idx.data() + n_cached;
+ *     float *out_new = out + n_cached * m;
+ *     // AssignCacheIdx can permute the keys, therefore it has to come before
+ *     // we call calc.
+ *     // Note: a call to AssignCacheIdx should always be preceded with
+ *     // GetCacheIdxPartitioned, because that initializes the cache_idx_new array
+ *     // with the cache set (hash bucket) that correspond to the keys.
+ *     // The cache idx will be assigned from that cache set.
+ *     cache.AssignCacheIdx(key_new, non_cached, cache_idx_new, stream);
+ *
+ *     calc(key_new, non_cached, m, out_new, stream);
+ *
+ *     // Store the calculated vectors into the cache.
+ *     cache.StoreVecs(out_new, non_cached, non_cached, cache_idx_new, stream);
+ *    }
+ * }
+ * @endcode
+ */
+template <typename math_t, int associativity = 32>
+class Cache {
+ public:
+  /**
+   * @brief Construct a Cache object
+   *
+   * @tparam math_t type of elements to be cached
+   * @tparam associativity number of vectors in a cache set
+   *
+   * @param stream cuda stream
+   * @param n_vec number of elements in a single vector that is stored in a
+   *   cache entry
+   * @param cache_size in MiB
+   */
+  Cache(cudaStream_t stream, int n_vec, float cache_size = 200)
+    : n_vec(n_vec),
+      cache_size(cache_size),
+      cache(0, stream),
+      cached_keys(0, stream),
+      cache_time(0, stream),
+      is_cached(0, stream),
+      ws_tmp(0, stream),
+      idx_tmp(0, stream),
+      d_num_selected_out(stream),
+      d_temp_storage(0, stream)
+  {
+    ASSERT(n_vec > 0, "Parameter n_vec: shall be larger than zero");
+    ASSERT(associativity > 0, "Associativity shall be larger than zero");
+    ASSERT(cache_size >= 0, "Cache size should not be negative");
+
+    // Calculate how many vectors would fit the cache
+    int n_cache_vecs = (cache_size * 1024 * 1024) / (sizeof(math_t) * n_vec);
+
+    // The available memory shall be enough for at least one cache set
+    if (n_cache_vecs >= associativity) {
+      n_cache_sets = n_cache_vecs / associativity;
+      n_cache_vecs = n_cache_sets * associativity;
+      cache.resize(n_cache_vecs * n_vec, stream);
+      cached_keys.resize(n_cache_vecs, stream);
+      cache_time.resize(n_cache_vecs, stream);
+      RAFT_CUDA_TRY(
+        cudaMemsetAsync(cached_keys.data(), 0, cached_keys.size() * sizeof(int), stream));
+      RAFT_CUDA_TRY(cudaMemsetAsync(cache_time.data(), 0, cache_time.size() * sizeof(int), stream));
+    } else {
+      if (cache_size > 0) {
+        RAFT_LOG_WARN(
+          "Warning: not enough memory to cache a single set of "
+          "rows, not using cache");
+      }
+      n_cache_sets = 0;
+      cache_size   = 0;
+    }
+    RAFT_LOG_DEBUG(
+      "Creating cache with size=%f MiB, to store %d vectors, in "
+      "%d sets with associativity=%d",
+      cache_size,
+      n_cache_vecs,
+      n_cache_sets,
+      associativity);
+  }
+
+  Cache(const Cache& other) = delete;
+
+  Cache& operator=(const Cache& other) = delete;
+
+  /** @brief Collect cached data into contiguous memory space.
+   *
+   * On exit, the out array is filled the following way:
+   * out[i + n_vec*k] = cache[i + n_vec * idx[k]], where i=0..n_vec-1,
+   * k = 0..n-1
+   *
+   * Idx values less than 0 are ignored.
+   *
+   * @param [in] idx cache indices, size [n]
+   * @param [in] n the number of vectors that need to be collected
+   * @param [out] out vectors collected from cache, size [n_vec*n]
+   * @param [in] stream cuda stream
+   */
+  void GetVecs(const int* idx, int n, math_t* out, cudaStream_t stream)
+  {
+    if (n > 0) {
+      get_vecs<<<raft::ceildiv(n * n_vec, TPB), TPB, 0, stream>>>(cache.data(), n_vec, idx, n, out);
+      RAFT_CUDA_TRY(cudaPeekAtLastError());
+    }
+  }
+
+  /** @brief Store vectors of data into the cache.
+   *
+   * Roughly the opposite of GetVecs, but the input vectors can be scattered
+   * in memory. The cache is updated using the following formula:
+   *
+   * cache[i + cache_idx[k]*n_vec] = tile[i + tile_idx[k]*n_vec],
+   * for i=0..n_vec-1, k=0..n-1
+   *
+   * If tile_idx==nullptr, then we assume tile_idx[k] = k.
+   *
+   * Elements within a vector should be contiguous in memory (i.e. column vectors
+   * for column major data storage, or row vectors of row major data).
+   *
+   * @param [in] tile stores the data to be cached, size [n_vec x n_tile]
+   * @param [in] n_tile number of vectors in tile (at least n)
+   * @param [in] n number of vectors that need to be stored in the cache (a subset
+   *   of all the vectors in the tile)
+   * @param [in] cache_idx cache indices for storing the vectors (negative values
+   *   are ignored), size [n]
+   * @param [in] stream cuda stream
+   * @param [in] tile_idx indices of vectors that need to be stored
+   */
+  void StoreVecs(const math_t* tile,
+                 int n_tile,
+                 int n,
+                 int* cache_idx,
+                 cudaStream_t stream,
+                 const int* tile_idx = nullptr)
+  {
+    if (n > 0) {
+      store_vecs<<<raft::ceildiv(n * n_vec, TPB), TPB, 0, stream>>>(
+        tile, n_tile, n_vec, tile_idx, n, cache_idx, cache.data(), cache.size() / n_vec);
+      RAFT_CUDA_TRY(cudaPeekAtLastError());
+    }
+  }
+
+  /** @brief Map a set of keys to cache indices.
+   *
+   * For each k in 0..n-1, if keys[k] is found in the cache, then cache_idx[k]
+   * will tell the corresponding cache idx, and is_cached[k] is set to true.
+   *
+   * If keys[k] is not found in the cache, then is_cached[k] is set to false.
+   * In this case we assign the cache set for keys[k], and cache_idx[k] will
+   * store the cache set.
+   *
+   * @note in order to retrieve the cached vector j=cache_idx[k] from the cache,
+   *  we have to access cache[i + j*n_vec], where i=0..n_vec-1.
+   *
+   * @note: do not use simultaneous GetCacheIdx and AssignCacheIdx
+   *
+   * @param [in] keys device array of keys, size [n]
+   * @param [in] n number of keys
+   * @param [out] cache_idx device array of cache indices corresponding to the
+   *   input keys, size [n]
+   * @param [out] is_cached whether the element is already available in the
+   *   cache, size [n]
+   * @param [in] stream
+   */
+  void GetCacheIdx(int* keys, int n, int* cache_idx, bool* is_cached, cudaStream_t stream)
+  {
+    // Advance the iteration counter; it is passed to the kernel as the time
+    // stamp for the cache entries accessed by this call.
+    n_iter++;
+
+    // An empty batch would request a launch with zero blocks, which CUDA
+    // reports as an invalid configuration; there is nothing to look up anyway.
+    if (n <= 0) { return; }
+    const int n_blocks = raft::ceildiv(n, TPB);
+    get_cache_idx<<<n_blocks, TPB, 0, stream>>>(keys, n, cached_keys.data(), n_cache_sets,
+                                                associativity, cache_time.data(), cache_idx,
+                                                is_cached, n_iter);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
+
+  /** @brief Map a set of keys to cache indices.
+   *
+   * Same as GetCacheIdx, but partitions the keys, and cache_idx arrays in a way
+   * that keys[0..n_cached-1] and cache_idx[0..n_cached-1] store the indices of
+   * vectors that are found in the cache, while keys[n_cached..n-1] are the
+   * indices of vectors that are not found in the cache. For the vectors not
+   * found in the cache, cache_idx[n_cached..n-1] stores the cache set, and this
+   * can be used to call AssignCacheIdx.
+   *
+   * @param [inout] keys device array of keys, size [n]
+   * @param [in] n number of indices
+   * @param [out] cache_idx device array of cache indices corresponding to
+   *   the input keys, size [n]
+   * @param [out] n_cached number of elements that are cached
+   * @param [in] stream cuda stream
+   */
+  void GetCacheIdxPartitioned(int* keys, int n, int* cache_idx, int* n_cached, cudaStream_t stream)
+  {
+    // Nothing to look up: report zero hits directly. Otherwise GetCacheIdx
+    // would be asked to launch a kernel over an empty grid.
+    if (n <= 0) {
+      *n_cached = 0;
+      return;
+    }
+
+    ResizeTmpBuffers(n, stream);
+
+    // ws_tmp receives, per key, the cache idx (hit) or the cache set (miss)
+    GetCacheIdx(keys, n, ws_tmp.data(), is_cached.data(), stream);
+
+    int* n_selected = d_num_selected_out.data();
+
+    // Group cache indices as [already cached, non_cached]
+    cub::DevicePartition::Flagged(d_temp_storage.data(), d_temp_storage_size, ws_tmp.data(),
+                                  is_cached.data(), cache_idx, n_selected, n, stream);
+
+    raft::update_host(n_cached, n_selected, 1, stream);
+
+    // Similarly re-group the input keys: partition from a scratch copy held
+    // in ws_tmp back into keys, using the same flags
+    raft::copy(ws_tmp.data(), keys, n, stream);
+    cub::DevicePartition::Flagged(d_temp_storage.data(), d_temp_storage_size, ws_tmp.data(),
+                                  is_cached.data(), keys, n_selected, n, stream);
+
+    // Wait for the work queued on the stream (including the n_cached copy)
+    raft::interruptible::synchronize(stream);
+  }
+
+  /**
+   * @brief Assign cache location to a set of keys.
+   *
+   * Note: call GetCacheIdx first, to get the cache_set assigned to the keys.
+   * Keys that cannot be cached are assigned to -1.
+   *
+   * @param [inout] keys device array of keys, size [n]
+   * @param [in] n number of elements that we want to cache
+   * @param [inout] cidx on entry: cache_set, on exit: assigned cache_idx or -1,
+   *   size[n]
+   * @param [in] stream cuda stream
+   */
+  void AssignCacheIdx(int* keys, int n, int* cidx, cudaStream_t stream)
+  {
+    if (n <= 0) return;
+    cub::DeviceRadixSort::SortPairs(d_temp_storage.data(),
+                                    d_temp_storage_size,
+                                    cidx,
+                                    ws_tmp.data(),
+                                    keys,
+                                    idx_tmp.data(),
+                                    n,
+                                    0,
+                                    sizeof(int) * 8,
+                                    stream);
+
+    raft::copy(keys, idx_tmp.data(), n, stream);
+
+    // set it to -1
+    RAFT_CUDA_TRY(cudaMemsetAsync(cidx, 255, n * sizeof(int), stream));
+    const int nthreads = associativity <= 32 ? associativity : 32;
+
+    assign_cache_idx<nthreads, associativity><<<n_cache_sets, nthreads, 0, stream>>>(
+      keys, n, ws_tmp.data(), cached_keys.data(), n_cache_sets, cache_time.data(), n_iter, cidx);
+
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+    if (debug_mode) RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  }
+
+  /** Return approximate cache size in MiB. */
+  float GetSizeInMiB() const { return cache_size; }
+
+  /**
+   * Returns the number of vectors that can be cached.
+   */
+  int GetSize() const { return cached_keys.size(); }
+
+ private:
+  int n_vec;         //!< Number of elements in a cached vector
+  float cache_size;  //!< in MiB
+  int n_cache_sets;  //!< number of cache sets
+
+  const int TPB = 256;  //!< threads per block for kernel launch
+  int n_iter    = 0;    //!< Counter for time stamping cache operation
+
+  bool debug_mode = false;
+
+  rmm::device_uvector<math_t> cache;     //!< The value of cached vectors
+  rmm::device_uvector<int> cached_keys;  //!< Keys stored at each cache loc
+  rmm::device_uvector<int> cache_time;   //!< Time stamp for LRU cache
+
+  // Helper arrays for GetCacheIdx
+  rmm::device_uvector<bool> is_cached;
+  rmm::device_uvector<int> ws_tmp;
+  rmm::device_uvector<int> idx_tmp;
+
+  // Helper arrays for cub
+  rmm::device_scalar<int> d_num_selected_out;
+  rmm::device_uvector<char> d_temp_storage;
+  size_t d_temp_storage_size = 0;
+
+  /** Grow the scratch buffers and the shared cub workspace so that they can serve n keys. */
+  void ResizeTmpBuffers(int n, cudaStream_t stream)
+  {
+    if (ws_tmp.size() >= static_cast<std::size_t>(n)) { return; }
+    ws_tmp.resize(n, stream);
+    is_cached.resize(n, stream);
+    idx_tmp.resize(n, stream);
+    // d_temp_storage serves both DevicePartition::Flagged (GetCacheIdxPartitioned) and
+    // DeviceRadixSort::SortPairs (AssignCacheIdx): query both and keep the larger size.
+    size_t flagged_bytes = 0, sort_bytes = 0;
+    int* k = ws_tmp.data();  // a null workspace makes cub report the size only; k fixes the types
+    cub::DevicePartition::Flagged(
+      nullptr, flagged_bytes, k, is_cached.data(), k, d_num_selected_out.data(), n, stream);
+    cub::DeviceRadixSort::SortPairs(nullptr, sort_bytes, k, k, k, k, n, 0, sizeof(int) * 8, stream);
+    d_temp_storage_size = flagged_bytes > sort_bytes ? flagged_bytes : sort_bytes;
+    d_temp_storage.resize(d_temp_storage_size, stream);
+  }
+};
+};  // namespace raft::cache
diff --git a/cpp/include/raft/util/fast_int_div.cuh b/cpp/include/raft/util/fast_int_div.cuh
new file mode 100644
index 0000000000..a0cb8f0f53
--- /dev/null
+++ b/cpp/include/raft/util/fast_int_div.cuh
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/util/cuda_utils.cuh>
+#include <stdint.h>
+
+namespace raft::util {
+
+/**
+ * @brief Fast integer division and modulo by a divisor that is known up front,
+ * following Hacker's Delight, Second Edition, Chapter 10
+ *
+ * @note This currently only supports 32b signed integers and positive divisors
+ * @todo Extend support for negative divisors
+ */
+struct FastIntDiv {
+  /**
+   * @defgroup HostMethods Ctor's that are accessible only from host
+   * @{
+   * @brief Host-only ctor's; each one (re)computes the scalars `m` and `p`
+   * @param _d the divisor
+   */
+  FastIntDiv(int _d) : d(_d) { computeScalars(); }
+  FastIntDiv& operator=(int _d)
+  {
+    d = _d;
+    computeScalars();
+    return *this;
+  }
+  /** @} */
+
+  /**
+   * @defgroup DeviceMethods Ctor's which even the device-side can access
+   * @{
+   * @brief Host and device copies; `d`, `m` and `p` are taken over verbatim
+   * @param other source object to be copied from
+   */
+  HDI FastIntDiv(const FastIntDiv& other) : d(other.d), m(other.m), p(other.p) {}
+  HDI FastIntDiv& operator=(const FastIntDiv& other)
+  {
+    d = other.d;
+    m = other.m;
+    p = other.p;
+    return *this;
+  }
+  /** @} */
+
+  /** the divisor this object was built for */
+  int d;
+  /** multiplier, the term 'm' as found in the reference chapter */
+  unsigned m;
+  /** shift amount, the term 'p' as found in the reference chapter */
+  int p;
+
+ private:
+  void computeScalars()
+  {
+    // Reject divisors the scheme cannot handle before doing any arithmetic.
+    if (d < 0) { ASSERT(false, "FastIntDiv: division by negative numbers not supported!"); }
+    if (d == 0) { ASSERT(false, "FastIntDiv: got division by zero!"); }
+    if (d == 1) {
+      // operator/ returns the numerator directly for d == 1 and never reads m or p
+      m = 0;
+      p = 1;
+      return;
+    }
+
+    // Raise p from 32 until 2^p exceeds nc * (d - 2^p mod d), then derive m from 2^p.
+    const int64_t nc = ((1LL << 31) / d) * d - 1;
+    int64_t twoP;
+    for (p = 32;; ++p) {
+      twoP = 1LL << p;
+      if (twoP > nc * (d - twoP % d)) { break; }
+    }
+    m = (twoP + d - twoP % d) / d;
+  }
+};  // struct FastIntDiv
+
+/**
+ * @brief Division overload: lets `n / FastIntDiv` replace a plain integer division,
+ *        in host as well as device code
+ * @param n numerator
+ * @param divisor the denominator
+ * @return the quotient
+ */
+HDI int operator/(int n, const FastIntDiv& divisor)
+{
+  if (divisor.d == 1) { return n; }
+  const int64_t product = int64_t(divisor.m) * int64_t(n);
+  const int quotient    = static_cast<int>(product >> divisor.p);
+  return (n < 0) ? quotient + 1 : quotient;
+}
+
+/**
+ * @brief Modulo overload: lets `n % FastIntDiv` replace a plain integer modulo,
+ *        in host as well as device code
+ * @param n numerator
+ * @param divisor the denominator
+ * @return the remainder
+ */
+HDI int operator%(int n, const FastIntDiv& divisor)
+{
+  // remainder = numerator - quotient * divisor, with the quotient taken from operator/
+  const int whole = n / divisor;
+  return n - whole * divisor.d;
+}
+
+};  // namespace raft::util
diff --git a/cpp/include/raft_distance/fused_l2_min_arg.hpp b/cpp/include/raft_distance/fused_l2_min_arg.hpp
new file mode 100644
index 0000000000..f7d3748666
--- /dev/null
+++ b/cpp/include/raft_distance/fused_l2_min_arg.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/handle.hpp>
+#include <raft/distance/distance_types.hpp>
+
+namespace raft::distance::runtime {
+
+/**
+ * @brief Wrapper around fusedL2NN with minimum reduction operators.
+ *
+ * fusedL2NN cannot be compiled in the distance library due to the lambda
+ * operators, so this wrapper covers the most common case (minimum).
+ *
+ * @param[in]  handle        raft handle
+ * @param[out] min           will contain the key (index) of the minimum reduction
+ *                           result, one entry per row of `x`. Length = `m` (on device)
+ * @param[in]  x             first matrix. Row major. Dim = `m x k` (on device).
+ * @param[in]  y             second matrix. Row major. Dim = `n x k` (on device).
+ * @param[in]  m             gemm m
+ * @param[in]  n             gemm n
+ * @param[in]  k             gemm k
+ * @param[in]  sqrt          Whether the distances being reduced should be L2-sqrt
+ *                           (forwarded to fusedL2NN)
+ */
+void fused_l2_nn_min_arg(raft::handle_t const& handle,
+                         int* min,
+                         const float* x,
+                         const float* y,
+                         int m,
+                         int n,
+                         int k,
+                         bool sqrt);
+
+/** Same as above, for double-precision inputs. */
+void fused_l2_nn_min_arg(raft::handle_t const& handle,
+                         int* min,
+                         const double* x,
+                         const double* y,
+                         int m,
+                         int n,
+                         int k,
+                         bool sqrt);
+
+}  // end namespace raft::distance::runtime
\ No newline at end of file
diff --git a/cpp/src/distance/fused_l2_min_arg.cu b/cpp/src/distance/fused_l2_min_arg.cu
new file mode 100644
index 0000000000..c722b5a566
--- /dev/null
+++ b/cpp/src/distance/fused_l2_min_arg.cu
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/handle.hpp>
+#include <raft/core/kvp.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/distance/fused_l2_nn.cuh>
+#include <raft/distance/specializations.cuh>
+#include <thrust/for_each.h>
+#include <thrust/tuple.h>
+
+namespace raft::distance::runtime {
+
+/** Unary functor that maps a raft::KeyValuePair to its key. */
+template <typename IndexT, typename DataT>
+struct KeyValueIndexOp {
+  using pair_t = raft::KeyValuePair<IndexT, DataT>;
+
+  // Returns the `key` member and ignores the value.
+  __host__ __device__ __forceinline__ IndexT operator()(const pair_t& kv) const { return kv.key; }
+};
+
+// Writes the key of each of the `m` fused reduction results into `min`, then syncs the stream.
+template <typename value_t, typename idx_t>
+void compute_fused_l2_nn_min_arg(raft::handle_t const& handle,
+                                 idx_t* min,
+                                 const value_t* x,
+                                 const value_t* y,
+                                 idx_t m,
+                                 idx_t n,
+                                 idx_t k,
+                                 bool sqrt)
+{
+  rmm::device_uvector<int> workspace(m, handle.get_stream());
+  auto kvp    = raft::make_device_vector<raft::KeyValuePair<idx_t, value_t>>(handle, m);
+  auto* pairs = kvp.data_handle();
+  // L2 row norms of both inputs, consumed by the fused reduction below.
+  rmm::device_uvector<value_t> x_norms(m, handle.get_stream());
+  rmm::device_uvector<value_t> y_norms(n, handle.get_stream());
+  raft::linalg::rowNorm(x_norms.data(), x, k, m, raft::linalg::L2Norm, true, handle.get_stream());
+  raft::linalg::rowNorm(y_norms.data(), y, k, n, raft::linalg::L2Norm, true, handle.get_stream());
+  fusedL2NNMinReduce(pairs,
+                     x,
+                     y,
+                     x_norms.data(),
+                     y_norms.data(),
+                     m,
+                     n,
+                     k,
+                     (void*)workspace.data(),
+                     sqrt,
+                     true,
+                     handle.get_stream());
+  // Keep only the key of every key-value pair; the value part is dropped.
+  thrust::transform(
+    handle.get_thrust_policy(), pairs, pairs + m, min, KeyValueIndexOp<idx_t, value_t>{});
+  handle.sync_stream();
+}
+
+void fused_l2_nn_min_arg(raft::handle_t const& handle,
+                         int* min,
+                         const float* x,
+                         const float* y,
+                         int m,
+                         int n,
+                         int k,
+                         bool sqrt)
+{
+  compute_fused_l2_nn_min_arg(handle, min, x, y, m, n, k, sqrt);  // value_t = float, idx_t = int
+}
+
+void fused_l2_nn_min_arg(raft::handle_t const& handle,
+                         int* min,
+                         const double* x,
+                         const double* y,
+                         int m,
+                         int n,
+                         int k,
+                         bool sqrt)
+{
+  compute_fused_l2_nn_min_arg(handle, min, x, y, m, n, k, sqrt);  // value_t = double, idx_t = int
+}
+
+}  // end namespace raft::distance::runtime
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu
new file mode 100644
index 0000000000..c893e9a358
--- /dev/null
+++ b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/detail/kernels/gram_matrix.cuh>
+
+template class raft::distance::kernels::detail::GramMatrixBase<double>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu
new file mode 100644
index 0000000000..3265f828e6
--- /dev/null
+++ b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/detail/kernels/gram_matrix.cuh>
+
+template class raft::distance::kernels::detail::GramMatrixBase<float>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
new file mode 100644
index 0000000000..0edf45a6f1
--- /dev/null
+++ b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/detail/kernels/kernel_matrices.cuh>
+
+template class raft::distance::kernels::detail::PolynomialKernel<double, int>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu
new file mode 100644
index 0000000000..a719175e6b
--- /dev/null
+++ b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/detail/kernels/kernel_matrices.cuh>
+
+template class raft::distance::kernels::detail::PolynomialKernel<float, int>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu
new file mode 100644
index 0000000000..6577e1b6c7
--- /dev/null
+++ b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/detail/kernels/kernel_matrices.cuh>
+
+template class raft::distance::kernels::detail::RBFKernel<double>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu
new file mode 100644
index 0000000000..1d2582cf81
--- /dev/null
+++ b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/detail/kernels/kernel_matrices.cuh>
+
+template class raft::distance::kernels::detail::RBFKernel<float>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu
new file mode 100644
index 0000000000..13d5159504
--- /dev/null
+++ b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/detail/kernels/kernel_matrices.cuh>
+
+template class raft::distance::kernels::detail::TanhKernel<double>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu
new file mode 100644
index 0000000000..ee62de7d34
--- /dev/null
+++ b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/detail/kernels/kernel_matrices.cuh>
+
+template class raft::distance::kernels::detail::TanhKernel<float>;
\ No newline at end of file
diff --git a/cpp/src/nn/specializations/ball_cover.cu b/cpp/src/nn/specializations/ball_cover.cu
index 15af9f6e68..b608a1a865 100644
--- a/cpp/src/nn/specializations/ball_cover.cu
+++ b/cpp/src/nn/specializations/ball_cover.cu
@@ -14,28 +14,29 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/ball_cover.cuh>
-#include <raft/spatial/knn/ball_cover_types.hpp>
+#include <raft/neighbors/ball_cover.cuh>
+#include <raft/neighbors/ball_cover_types.hpp>
 
 // Ignore upstream specializations to avoid unnecessary recompiling
+#ifdef RAFT_DISTANCE_COMPILED
 #include <raft/distance/specializations.cuh>
-#include <raft/spatial/knn/specializations/detail/ball_cover_lowdim.hpp>
-#include <raft/spatial/knn/specializations/fused_l2_knn.cuh>
-#include <raft/spatial/knn/specializations/knn.cuh>
+#endif
+
+#include <raft/neighbors/specializations/detail/ball_cover_lowdim.hpp>
+#include <raft/neighbors/specializations/fused_l2_knn.cuh>
+#include <raft/neighbors/specializations/knn.cuh>
 
 #include <cstdint>
 
-namespace raft {
-namespace spatial {
-namespace knn {
+namespace raft::neighbors::ball_cover {
 template class BallCoverIndex<int, float, std::uint32_t, std::uint32_t>;
 template class BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>;
 
-template void rbc_build_index<std::int64_t, float, std::uint32_t, std::uint32_t>(
+template void build_index<std::int64_t, float, std::uint32_t, std::uint32_t>(
   const raft::handle_t& handle,
   BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index);
 
-template void rbc_knn_query<std::int64_t, float, std::uint32_t>(
+template void knn_query<std::int64_t, float, std::uint32_t>(
   const raft::handle_t& handle,
   const BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index,
   std::uint32_t k,
@@ -46,7 +47,7 @@ template void rbc_knn_query<std::int64_t, float, std::uint32_t>(
   bool perform_post_filtering,
   float weight);
 
-template void rbc_all_knn_query<std::int64_t, float, std::uint32_t>(
+template void all_knn_query<std::int64_t, float, std::uint32_t>(
   const raft::handle_t& handle,
   BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index,
   std::uint32_t k,
@@ -55,6 +56,4 @@ template void rbc_all_knn_query<std::int64_t, float, std::uint32_t>(
   bool perform_post_filtering,
   float weight);
 
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
+};  // namespace raft::neighbors::ball_cover
diff --git a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu
index d2d729a52d..961af0b89c 100644
--- a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu
+++ b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu
@@ -15,8 +15,8 @@
  */
 
 #include <cstdint>
+#include <raft/neighbors/specializations/detail/ball_cover_lowdim.hpp>
 #include <raft/spatial/knn/detail/ball_cover/registers.cuh>
-#include <raft/spatial/knn/specializations/detail/ball_cover_lowdim.hpp>
 
 namespace raft {
 namespace spatial {
diff --git a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu
index 0b32d43ba9..daa509b5b1 100644
--- a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu
+++ b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu
@@ -15,8 +15,8 @@
  */
 
 #include <cstdint>
+#include <raft/neighbors/specializations/detail/ball_cover_lowdim.hpp>
 #include <raft/spatial/knn/detail/ball_cover/registers.cuh>
-#include <raft/spatial/knn/specializations/detail/ball_cover_lowdim.hpp>
 
 namespace raft {
 namespace spatial {
diff --git a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu
index 7c8f18859f..9487641945 100644
--- a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu
+++ b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu
@@ -15,8 +15,8 @@
  */
 
 #include <cstdint>
+#include <raft/neighbors/specializations/detail/ball_cover_lowdim.hpp>
 #include <raft/spatial/knn/detail/ball_cover/registers.cuh>
-#include <raft/spatial/knn/specializations/detail/ball_cover_lowdim.hpp>
 
 namespace raft {
 namespace spatial {
diff --git a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu
index 1ef071033c..c07ed45427 100644
--- a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu
+++ b/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu
@@ -15,8 +15,8 @@
  */
 
 #include <cstdint>
+#include <raft/neighbors/specializations/detail/ball_cover_lowdim.hpp>
 #include <raft/spatial/knn/detail/ball_cover/registers.cuh>
-#include <raft/spatial/knn/specializations/detail/ball_cover_lowdim.hpp>
 
 namespace raft {
 namespace spatial {
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_fast.cu b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_fast.cu
index 52544995ad..21f0b3f976 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_fast.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_fast.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 #include <cuda_fp16.h>
 
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu
index 8dbd0bf37f..78c7eebae0 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 #include <cuda_fp16.h>
 
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu
index 7f141e377e..9cb8dddf13 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 #include <cuda_fp16.h>
 
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu
index 570b9a83ae..079aa796c6 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 #include <cuda_fp16.h>
 
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu
index 7d66dd1239..ed69e70116 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 #include <cuda_fp16.h>
 
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu
index 2d07f1ec58..4e9d441910 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 #include <cuda_fp16.h>
 
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu
index 964f8e1836..4ae2a073ce 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 #include <cuda_fp16.h>
 
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu
index 338bfb16da..8277e41b5f 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 #include <cuda_fp16.h>
 
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu
index 618812d8dd..f865ef167f 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 #include <cuda_fp16.h>
 
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_fast.cu b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_fast.cu
index e12ea80b93..0627d7a2b8 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_fast.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_fast.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 #include <cuda_fp16.h>
 
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu
index 4014c0322d..af761191ee 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 #include <cuda_fp16.h>
 
diff --git a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu
index 604527af9a..e24663ca0b 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 #include <cuda_fp16.h>
 
diff --git a/cpp/src/nn/specializations/detail/ivfpq_search_float_int64_t.cu b/cpp/src/nn/specializations/detail/ivfpq_search_float_int64_t.cu
index b3715b642d..a32147b2b1 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_search_float_int64_t.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_search_float_int64_t.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 namespace raft::spatial::knn::ivf_pq::detail {
 
diff --git a/cpp/src/nn/specializations/detail/ivfpq_search_float_uint32_t.cu b/cpp/src/nn/specializations/detail/ivfpq_search_float_uint32_t.cu
index adac4942ad..f3e80206e4 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_search_float_uint32_t.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_search_float_uint32_t.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 namespace raft::spatial::knn::ivf_pq::detail {
 
diff --git a/cpp/src/nn/specializations/detail/ivfpq_search_float_uint64_t.cu b/cpp/src/nn/specializations/detail/ivfpq_search_float_uint64_t.cu
index 44d4505e3d..e732646f99 100644
--- a/cpp/src/nn/specializations/detail/ivfpq_search_float_uint64_t.cu
+++ b/cpp/src/nn/specializations/detail/ivfpq_search_float_uint64_t.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/spatial/knn/specializations/detail/ivf_pq_search.cuh>
+#include <raft/neighbors/specializations/detail/ivf_pq_search.cuh>
 
 namespace raft::spatial::knn::ivf_pq::detail {
 
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index c55a3602d7..b8ec9347fb 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -82,7 +82,7 @@ if(BUILD_TESTS)
             PATH
             test/cluster/kmeans.cu
             test/cluster_solvers.cu
-            test/sparse/linkage.cu
+            test/cluster/linkage.cu
             OPTIONAL DIST NN
     )
 
@@ -116,6 +116,7 @@ if(BUILD_TESTS)
             test/distance/dist_minkowski.cu
             test/distance/dist_russell_rao.cu
             test/distance/fused_l2_nn.cu
+            test/distance/gram.cu
             OPTIONAL DIST
     )
 
@@ -216,34 +217,35 @@ if(BUILD_TESTS)
             OPTIONAL DIST NN
     )
 
-    ConfigureTest(NAME SPARSE_NN_TEST
+    ConfigureTest(NAME SPARSE_NEIGHBORS_TEST
             PATH
-            test/sparse/connect_components.cu
-            test/sparse/knn.cu
-            test/sparse/knn_graph.cu
+            test/sparse/neighbors/connect_components.cu
+            test/sparse/neighbors/brute_force.cu
+            test/sparse/neighbors/knn_graph.cu
             OPTIONAL DIST NN
     )
 
-    ConfigureTest(NAME SPATIAL_TEST
+    ConfigureTest(NAME NEIGHBORS_TEST
             PATH
-            test/spatial/ann_ivf_flat.cu
-            test/spatial/ann_ivf_pq/test_float_int64_t.cu
-            test/spatial/ann_ivf_pq/test_float_uint32_t.cu
-            test/spatial/ann_ivf_pq/test_float_uint64_t.cu
-            test/spatial/ann_ivf_pq/test_int8_t_uint64_t.cu
-            test/spatial/ann_ivf_pq/test_uint8_t_uint64_t.cu
-            test/spatial/knn.cu
-            test/spatial/fused_l2_knn.cu
-            test/spatial/haversine.cu
-            test/spatial/ball_cover.cu
-            test/spatial/epsilon_neighborhood.cu
-            test/spatial/faiss_mr.cu
-            test/spatial/selection.cu
+            test/neighbors/ann_ivf_flat.cu
+            test/neighbors/ann_ivf_pq/test_float_int64_t.cu
+            test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
+            test/neighbors/ann_ivf_pq/test_float_uint64_t.cu
+            test/neighbors/ann_ivf_pq/test_int8_t_uint64_t.cu
+            test/neighbors/ann_ivf_pq/test_uint8_t_uint64_t.cu
+            test/neighbors/knn.cu
+            test/neighbors/fused_l2_knn.cu
+            test/neighbors/haversine.cu
+            test/neighbors/ball_cover.cu
+            test/neighbors/epsilon_neighborhood.cu
+            test/neighbors/faiss_mr.cu
+            test/neighbors/selection.cu
             OPTIONAL DIST NN
     )
 
     ConfigureTest(NAME STATS_TEST
             PATH
+            test/stats/accuracy.cu
             test/stats/adjusted_rand_index.cu
             test/stats/completeness_score.cu
             test/stats/contingencyMatrix.cu
@@ -259,7 +261,9 @@ if(BUILD_TESTS)
             test/stats/mean_center.cu
             test/stats/minmax.cu
             test/stats/mutual_info_score.cu
+            test/stats/r2_score.cu
             test/stats/rand_index.cu
+            test/stats/regression_metrics.cu
             test/stats/silhouette_score.cu
             test/stats/stddev.cu
             test/stats/sum.cu
diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/cluster/linkage.cu
similarity index 98%
rename from cpp/test/sparse/linkage.cu
rename to cpp/test/cluster/linkage.cu
index ce5741d06b..5533f552bd 100644
--- a/cpp/test/sparse/linkage.cu
+++ b/cpp/test/cluster/linkage.cu
@@ -180,20 +180,21 @@ class LinkageTest : public ::testing::TestWithParam<LinkageInputs<T, IdxT>> {
 
     raft::handle_t handle;
 
-    auto data_view =
-      raft::make_device_matrix_view<T, IdxT, row_major>(data.data(), params.n_row, params.n_col);
+    auto data_view = raft::make_device_matrix_view<const T, IdxT, row_major>(
+      data.data(), params.n_row, params.n_col);
     auto dendrogram_view =
       raft::make_device_matrix_view<IdxT, IdxT, row_major>(out_children.data(), params.n_row, 2);
     auto labels_view = raft::make_device_vector_view<IdxT, IdxT>(labels.data(), params.n_row);
 
-    raft::cluster::single_linkage<T, IdxT, raft::hierarchy::LinkageDistance::KNN_GRAPH>(
-      handle,
-      data_view,
-      dendrogram_view,
-      labels_view,
-      raft::distance::DistanceType::L2SqrtExpanded,
-      params.n_clusters,
-      std::make_optional<int>(params.c));
+    raft::cluster::hierarchy::
+      single_linkage<T, IdxT, raft::cluster::hierarchy::LinkageDistance::KNN_GRAPH>(
+        handle,
+        data_view,
+        dendrogram_view,
+        labels_view,
+        raft::distance::DistanceType::L2SqrtExpanded,
+        params.n_clusters,
+        std::make_optional<int>(params.c));
 
     handle.sync_stream(stream);
 
diff --git a/cpp/test/cluster_solvers_deprecated.cu b/cpp/test/cluster_solvers_deprecated.cu
index 1e9ec0c15b..167a710b34 100644
--- a/cpp/test/cluster_solvers_deprecated.cu
+++ b/cpp/test/cluster_solvers_deprecated.cu
@@ -20,7 +20,6 @@
 #include <raft/core/handle.hpp>
 
 #include <raft/spectral/cluster_solvers_deprecated.cuh>
-#include <raft/spectral/modularity_maximization.cuh>
 
 namespace raft {
 namespace spectral {
@@ -54,52 +53,5 @@ TEST(Raft, ClusterSolvers)
   EXPECT_ANY_THROW(cluster_solver.solve(h, n, d, eigvecs, codes));
 }
 
-TEST(Raft, ModularitySolvers)
-{
-  using namespace matrix;
-  using index_type = int;
-  using value_type = double;
-
-  handle_t h;
-  ASSERT_EQ(0,
-            h.
-
-            get_device()
-
-  );
-
-  index_type neigvs{10};
-  index_type maxiter{100};
-  index_type restart_iter{10};
-  value_type tol{1.0e-10};
-  bool reorthog{true};
-
-  // nullptr expected to trigger exceptions:
-  //
-  index_type* clusters{nullptr};
-  value_type* eigvals{nullptr};
-  value_type* eigvecs{nullptr};
-
-  unsigned long long seed{100110021003};
-
-  eigen_solver_config_t<index_type, value_type> eig_cfg{
-    neigvs, maxiter, restart_iter, tol, reorthog, seed};
-  lanczos_solver_t<index_type, value_type> eig_solver{eig_cfg};
-
-  index_type k{5};
-
-  cluster_solver_config_deprecated_t<index_type, value_type> clust_cfg{k, maxiter, tol, seed};
-  kmeans_solver_deprecated_t<index_type, value_type> cluster_solver{clust_cfg};
-
-  auto stream = h.get_stream();
-  sparse_matrix_t<index_type, value_type> sm{h, nullptr, nullptr, nullptr, 0, 0};
-
-  EXPECT_ANY_THROW(spectral::modularity_maximization(
-    h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs));
-
-  value_type modularity{0};
-  EXPECT_ANY_THROW(spectral::analyzeModularity(h, sm, k, clusters, modularity));
-}
-
 }  // namespace spectral
 }  // namespace raft
diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu
new file mode 100644
index 0000000000..cf7215bddb
--- /dev/null
+++ b/cpp/test/distance/gram.cu
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.hpp>
+#endif
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <iostream>
+#include <memory>
+#include <raft/distance/distance_types.hpp>
+#include <raft/distance/kernels.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft::distance::kernels {
+
+// Get the offset of element [i,k].
+HDI int get_offset(int i, int k, int ld, bool is_row_major)
+{
+  return is_row_major ? i * ld + k : i + k * ld;
+}
+
+struct GramMatrixInputs {
+  int n1;      // feature vectors in matrix 1
+  int n2;      // feature vectors in matrix 2
+  int n_cols;  // number of elements in a feature vector
+  bool is_row_major;
+  KernelParams kernel;
+  int ld1;
+  int ld2;
+  int ld_out;
+  // We will generate random input using the dimensions given here.
+  // The reference output is calculated by a custom kernel.
+};
+
+std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p)
+{
+  std::vector<std::string> kernel_names{"linear", "poly", "rbf", "tanh"};
+  os << "/" << p.n1 << "x" << p.n2 << "x" << p.n_cols << "/"
+     << (p.is_row_major ? "RowMajor/" : "ColMajor/") << kernel_names[p.kernel.kernel] << "/ld_"
+     << p.ld1 << "x" << p.ld2 << "x" << p.ld_out;
+  return os;
+}
+
+const std::vector<GramMatrixInputs> inputs = {
+  {42, 137, 2, false, {KernelType::LINEAR}},
+  {42, 137, 2, true, {KernelType::LINEAR}},
+  {42, 137, 2, false, {KernelType::LINEAR}, 64, 179, 181},
+  {42, 137, 2, true, {KernelType::LINEAR}, 64, 179, 181},
+  {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}},
+  {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}},
+  {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144},
+  {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144},
+  {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}},
+  {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}},
+  {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 49},
+  {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 143},
+  {3, 4, 2, false, {KernelType::RBF, 0, 0.5}},
+  {42, 137, 2, false, {KernelType::RBF, 0, 0.5}},
+  {42, 137, 2, true, {KernelType::RBF, 0, 0.5}},
+  // Distance kernel does not support LD parameter yet.
+  //{42, 137, 2, false, {KernelType::RBF, 0, 0.5}, 64, 155, 49},
+  // {42, 137, 2, true, {KernelType::RBF, 0, 0.5}, 64, 155, 143},
+};
+
+template <typename math_t>
+class GramMatrixTest : public ::testing::TestWithParam<GramMatrixInputs> {
+ protected:
+  GramMatrixTest()
+    : params(GetParam()), stream(0), x1(0, stream), x2(0, stream), gram(0, stream), gram_host(0)
+  {
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+
+    if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; }
+    if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; }
+    if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; }
+    // Derive the size of the output from the offset of the last element.
+    size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1;
+    x1.resize(size, stream);
+    size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1;
+    x2.resize(size, stream);
+    size = get_offset(params.n1 - 1, params.n2 - 1, params.ld_out, params.is_row_major) + 1;
+
+    gram.resize(size, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(gram.data(), 0, gram.size() * sizeof(math_t), stream));
+    gram_host.resize(gram.size());
+    std::fill(gram_host.begin(), gram_host.end(), 0);
+
+    raft::random::Rng r(42137ULL);
+    r.uniform(x1.data(), x1.size(), math_t(0), math_t(1), stream);
+    r.uniform(x2.data(), x2.size(), math_t(0), math_t(1), stream);
+  }
+
+  ~GramMatrixTest() override { RAFT_CUDA_TRY_NO_THROW(cudaStreamDestroy(stream)); }
+
+  // Calculate the Gram matrix on the host.
+  void naiveKernel()
+  {
+    std::vector<math_t> x1_host(x1.size());
+    raft::update_host(x1_host.data(), x1.data(), x1.size(), stream);
+    std::vector<math_t> x2_host(x2.size());
+    raft::update_host(x2_host.data(), x2.data(), x2.size(), stream);
+    handle.sync_stream(stream);
+
+    for (int i = 0; i < params.n1; i++) {
+      for (int j = 0; j < params.n2; j++) {
+        float d = 0;
+        for (int k = 0; k < params.n_cols; k++) {
+          if (params.kernel.kernel == KernelType::RBF) {
+            math_t diff = x1_host[get_offset(i, k, params.ld1, params.is_row_major)] -
+                          x2_host[get_offset(j, k, params.ld2, params.is_row_major)];
+            d += diff * diff;
+          } else {
+            d += x1_host[get_offset(i, k, params.ld1, params.is_row_major)] *
+                 x2_host[get_offset(j, k, params.ld2, params.is_row_major)];
+          }
+        }
+        int idx  = get_offset(i, j, params.ld_out, params.is_row_major);
+        math_t v = 0;
+        switch (params.kernel.kernel) {
+          case (KernelType::LINEAR): gram_host[idx] = d; break;
+          case (KernelType::POLYNOMIAL):
+            v              = params.kernel.gamma * d + params.kernel.coef0;
+            gram_host[idx] = std::pow(v, params.kernel.degree);
+            break;
+          case (KernelType::TANH):
+            gram_host[idx] = std::tanh(params.kernel.gamma * d + params.kernel.coef0);
+            break;
+          case (KernelType::RBF): gram_host[idx] = exp(-params.kernel.gamma * d); break;
+        }
+      }
+    }
+  }
+
+  void runTest()
+  {
+    std::unique_ptr<GramMatrixBase<math_t>> kernel = std::unique_ptr<GramMatrixBase<math_t>>(
+      KernelFactory<math_t>::create(params.kernel, handle.get_cublas_handle()));
+
+    kernel->evaluate(x1.data(),
+                     params.n1,
+                     params.n_cols,
+                     x2.data(),
+                     params.n2,
+                     gram.data(),
+                     params.is_row_major,
+                     stream,
+                     params.ld1,
+                     params.ld2,
+                     params.ld_out);
+    naiveKernel();
+    ASSERT_TRUE(raft::devArrMatchHost(
+      gram_host.data(), gram.data(), gram.size(), raft::CompareApprox<math_t>(1e-6f)));
+  }
+
+  raft::handle_t handle;
+  cudaStream_t stream = 0;
+  GramMatrixInputs params;
+
+  rmm::device_uvector<math_t> x1;
+  rmm::device_uvector<math_t> x2;
+  rmm::device_uvector<math_t> gram;
+  std::vector<math_t> gram_host;
+};
+
+typedef GramMatrixTest<float> GramMatrixTestFloat;
+typedef GramMatrixTest<double> GramMatrixTestDouble;
+
+TEST_P(GramMatrixTestFloat, Gram) { runTest(); }
+
+INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloat, ::testing::ValuesIn(inputs));
+};  // end namespace raft::distance::kernels
\ No newline at end of file
diff --git a/cpp/test/linalg/reduce_rows_by_key.cu b/cpp/test/linalg/reduce_rows_by_key.cu
index e575f37dd6..7b124cb7bb 100644
--- a/cpp/test/linalg/reduce_rows_by_key.cu
+++ b/cpp/test/linalg/reduce_rows_by_key.cu
@@ -103,7 +103,7 @@ class ReduceRowTest : public ::testing::TestWithParam<ReduceRowsInputs<T>> {
     raft::random::RngState r(params.seed);
     raft::random::RngState r_int(params.seed);
 
-    int nobs       = params.nobs;
+    uint32_t nobs  = params.nobs;
     uint32_t cols  = params.cols;
     uint32_t nkeys = params.nkeys;
     uniform(handle, r, in.data(), nobs * cols, T(0.0), T(2.0 / nobs));
diff --git a/cpp/test/matrix/linewise_op.cu b/cpp/test/matrix/linewise_op.cu
index 9d3d5af51e..931c3ecb16 100644
--- a/cpp/test/matrix/linewise_op.cu
+++ b/cpp/test/matrix/linewise_op.cu
@@ -64,10 +64,10 @@ struct LinewiseTest : public ::testing::TestWithParam<typename ParamsReader::Par
     I m = rowmajor ? lineLen : nLines;
     I n = rowmajor ? nLines : lineLen;
 
-    auto in_view  = raft::make_device_matrix_view<const T, I, layout>(in, m, n);
-    auto out_view = raft::make_device_matrix_view<T, I, layout>(out, m, n);
+    auto in_view  = raft::make_device_matrix_view<const T, I, layout>(in, n, m);
+    auto out_view = raft::make_device_matrix_view<T, I, layout>(out, n, m);
 
-    auto vec_view = raft::make_device_vector_view<const T>(vec, m);
+    auto vec_view = raft::make_device_vector_view<const T>(vec, lineLen);
     matrix::linewise_op(handle, in_view, out_view, raft::is_row_major(in_view), f, vec_view);
   }
 
@@ -81,10 +81,10 @@ struct LinewiseTest : public ::testing::TestWithParam<typename ParamsReader::Par
     I m = rowmajor ? lineLen : nLines;
     I n = rowmajor ? nLines : lineLen;
 
-    auto in_view   = raft::make_device_matrix_view<const T, I, layout>(in, m, n);
-    auto out_view  = raft::make_device_matrix_view<T, I, layout>(out, m, n);
-    auto vec1_view = raft::make_device_vector_view<const T, I>(vec1, m);
-    auto vec2_view = raft::make_device_vector_view<const T, I>(vec2, m);
+    auto in_view   = raft::make_device_matrix_view<const T, I, layout>(in, n, m);
+    auto out_view  = raft::make_device_matrix_view<T, I, layout>(out, n, m);
+    auto vec1_view = raft::make_device_vector_view<const T, I>(vec1, lineLen);
+    auto vec2_view = raft::make_device_vector_view<const T, I>(vec2, lineLen);
 
     matrix::linewise_op(
       handle, in_view, out_view, raft::is_row_major(in_view), f, vec1_view, vec2_view);
@@ -99,6 +99,19 @@ struct LinewiseTest : public ::testing::TestWithParam<typename ParamsReader::Par
     return blob;
   }
 
+  template <typename layout>
+  void runLinewiseSumPadded(raft::device_aligned_matrix_view<T, I, layout> out,
+                            raft::device_aligned_matrix_view<const T, I, layout> in,
+                            const I lineLen,
+                            const I nLines,
+                            const bool alongLines,
+                            const T* vec)
+  {
+    auto f        = [] __device__(T a, T b) -> T { return a + b; };
+    auto vec_view = raft::make_device_vector_view<const T, I>(vec, lineLen);
+    matrix::linewise_op(handle, in, out, alongLines, f, vec_view);
+  }
+
   /**
    * Suggest multiple versions of matrix dimensions (n, m), such that
    *
@@ -205,11 +218,129 @@ struct LinewiseTest : public ::testing::TestWithParam<typename ParamsReader::Par
     return r;
   }
 
+  testing::AssertionResult runWithPaddedSpan(std::vector<std::tuple<I, I>>&& dims,
+                                             rmm::device_uvector<T>&& blob)
+  {
+    rmm::device_uvector<T> blob_val(params.checkCorrectness ? blob.size() / 2 : 0, stream);
+
+    stream.synchronize();
+    cudaProfilerStart();
+    testing::AssertionResult r = testing::AssertionSuccess();
+    for (auto alongRows : ::testing::Bool()) {
+      for (auto [n, m] : dims) {
+        if (!r) break;
+        // take dense testdata
+        auto [out, in, vec1, vec2] = assignSafePtrs(blob, n, m);
+        common::nvtx::range dims_scope("Dims-%zu-%zu", std::size_t(n), std::size_t(m));
+        common::nvtx::range dir_scope(alongRows ? "alongRows" : "acrossRows");
+
+        auto lineLen = m;
+        auto nLines  = n;
+
+        // create a padded span based on testdata (just for functional testing)
+        size_t matrix_size_padded;
+        if (alongRows) {
+          auto extents = matrix_extent<I>{n, m};
+          typename raft::layout_right_padded<T>::mapping<matrix_extent<I>> layout{extents};
+          matrix_size_padded = layout.required_span_size();
+        } else {
+          auto extents = matrix_extent<I>{m, n};
+          typename raft::layout_left_padded<T>::mapping<matrix_extent<I>> layout{extents};
+          matrix_size_padded = layout.required_span_size();
+        }
+
+        rmm::device_uvector<T> blob_in(matrix_size_padded, stream);
+        rmm::device_uvector<T> blob_out(matrix_size_padded, stream);
+
+        {
+          auto in2 = in;
+
+          // actual testrun
+          common::nvtx::range vecs_scope("one vec");
+          if (alongRows) {
+            auto inSpan = make_device_aligned_matrix_view<T, I, raft::layout_right_padded<T>>(
+              blob_in.data(), nLines, lineLen);
+            auto outSpan = make_device_aligned_matrix_view<T, I, raft::layout_right_padded<T>>(
+              blob_out.data(), nLines, lineLen);
+            // prep padded input data
+            thrust::for_each_n(rmm::exec_policy(stream),
+                               thrust::make_counting_iterator(0ul),
+                               nLines * lineLen,
+                               [inSpan, in2, lineLen] __device__(size_t i) {
+                                 inSpan(i / lineLen, i % lineLen) = in2[i];
+                               });
+            auto inSpanConst =
+              make_device_aligned_matrix_view<const T, I, raft::layout_right_padded<T>>(
+                blob_in.data(), nLines, lineLen);
+            runLinewiseSumPadded<raft::layout_right_padded<T>>(
+              outSpan, inSpanConst, lineLen, nLines, alongRows, vec1);
+
+            if (params.checkCorrectness) {
+              runLinewiseSum<raft::row_major>(out, in, lineLen, nLines, vec1);
+              auto out_dense = blob_val.data();
+              thrust::for_each_n(rmm::exec_policy(stream),
+                                 thrust::make_counting_iterator(0ul),
+                                 nLines * lineLen,
+                                 [outSpan, out_dense, lineLen] __device__(size_t i) {
+                                   out_dense[i] = outSpan(i / lineLen, i % lineLen);
+                                 });
+              r = devArrMatch(out_dense, out, n * m, CompareApprox<T>(params.tolerance))
+                  << " " << (alongRows ? "alongRows" : "acrossRows")
+                  << " with one vec;  lineLen: " << lineLen << "; nLines " << nLines;
+              if (!r) break;
+            }
+
+          } else {
+            auto inSpan = make_device_aligned_matrix_view<T, I, raft::layout_left_padded<T>>(
+              blob_in.data(), lineLen, nLines);
+            auto outSpan = make_device_aligned_matrix_view<T, I, raft::layout_left_padded<T>>(
+              blob_out.data(), lineLen, nLines);
+            // prep padded input data
+            thrust::for_each_n(rmm::exec_policy(stream),
+                               thrust::make_counting_iterator(0ul),
+                               nLines * lineLen,
+                               [inSpan, in2, lineLen] __device__(size_t i) {
+                                 inSpan(i % lineLen, i / lineLen) = in2[i];
+                               });
+            auto inSpanConst =
+              make_device_aligned_matrix_view<const T, I, raft::layout_left_padded<T>>(
+                blob_in.data(), lineLen, nLines);
+            runLinewiseSumPadded<raft::layout_left_padded<T>>(
+              outSpan, inSpanConst, lineLen, nLines, alongRows, vec1);
+
+            if (params.checkCorrectness) {
+              runLinewiseSum<raft::col_major>(out, in, lineLen, nLines, vec1);
+              auto out_dense = blob_val.data();
+              thrust::for_each_n(rmm::exec_policy(stream),
+                                 thrust::make_counting_iterator(0ul),
+                                 nLines * lineLen,
+                                 [outSpan, out_dense, lineLen] __device__(size_t i) {
+                                   out_dense[i] = outSpan(i % lineLen, i / lineLen);
+                                 });
+              r = devArrMatch(out_dense, out, n * m, CompareApprox<T>(params.tolerance))
+                  << " " << (alongRows ? "alongRows" : "acrossRows")
+                  << " with one vec;  lineLen: " << lineLen << "; nLines " << nLines;
+              if (!r) break;
+            }
+          }
+        }
+      }
+    }
+    cudaProfilerStop();
+
+    return r;
+  }
+
   testing::AssertionResult run()
   {
     return run(suggestDimensions(2), genData(params.workSizeBytes));
   }
 
+  testing::AssertionResult runWithPaddedSpan()
+  {
+    return runWithPaddedSpan(suggestDimensions(2), genData(params.workSizeBytes));
+  }
+
   testing::AssertionResult runEdgeCases()
   {
     std::vector<I> sizes = {1, 2, 3, 4, 7, 16};
@@ -230,6 +361,13 @@ struct LinewiseTest : public ::testing::TestWithParam<typename ParamsReader::Par
   TEST_P(TestClass##_##ElemType##_##IndexType, fun) { ASSERT_TRUE(fun()); }                  \
   INSTANTIATE_TEST_SUITE_P(LinewiseOp, TestClass##_##ElemType##_##IndexType, TestClass##Params)
 
+#define TEST_IT_SPAN(fun, TestClass, ElemType, IndexType)                                        \
+  typedef LinewiseTest<ElemType, IndexType, TestClass> TestClass##Span_##ElemType##_##IndexType; \
+  TEST_P(TestClass##Span_##ElemType##_##IndexType, fun) { ASSERT_TRUE(fun()); }                  \
+  INSTANTIATE_TEST_SUITE_P(LinewiseOpSpan, TestClass##Span_##ElemType##_##IndexType, SpanParams)
+
+auto SpanParams = ::testing::Combine(::testing::Values(0), ::testing::Values(0));
+
 auto TinyParams = ::testing::Combine(::testing::Values(0, 1, 2, 4), ::testing::Values(0, 1, 2, 3));
 
 struct Tiny {
@@ -299,5 +437,10 @@ TEST_IT(run, Gigabyte, double, int);
 TEST_IT(run, TenGigs, float, uint64_t);
 TEST_IT(run, TenGigs, double, uint64_t);
 
+TEST_IT_SPAN(runWithPaddedSpan, Megabyte, float, int);
+TEST_IT_SPAN(runWithPaddedSpan, Megabyte, double, int);
+TEST_IT_SPAN(runWithPaddedSpan, Gigabyte, float, int);
+TEST_IT_SPAN(runWithPaddedSpan, Gigabyte, double, int);
+
 }  // namespace matrix
 }  // end namespace raft
diff --git a/cpp/test/mdarray.cu b/cpp/test/mdarray.cu
index 3931ff224c..c292feb894 100644
--- a/cpp/test/mdarray.cu
+++ b/cpp/test/mdarray.cu
@@ -436,6 +436,425 @@ TEST(MDArray, FuncArg)
   }
 }
 
+void test_mdspan_layout_right_padded()
+{
+  {
+    // 2x5 example (n_rows x n_cols),
+    constexpr int n_rows          = 2;
+    constexpr int n_cols          = 5;
+    constexpr int alignment       = 8;
+    constexpr int alignment_bytes = sizeof(int) * alignment;
+
+    int data_row_major[] = {
+      1,
+      2,
+      3,
+      4,
+      5, /* X  X  X */
+      6,
+      7,
+      8,
+      9,
+      10 /* X  X  X */
+    };
+    // manually aligning the above, using -1 as filler
+    static constexpr int X = -1;
+    int data_padded[]      = {1, 2, 3, 4, 5, X, X, X, 6, 7, 8, 9, 10, X, X, X};
+
+    using extents_type = stdex::extents<size_t, stdex::dynamic_extent, stdex::dynamic_extent>;
+    using padded_layout_row_major =
+      stdex::layout_right_padded<detail::padding<int, alignment_bytes>::value>;
+    using padded_mdspan    = stdex::mdspan<int, extents_type, padded_layout_row_major>;
+    using row_major_mdspan = stdex::mdspan<int, extents_type, stdex::layout_right>;
+
+    padded_layout_row_major::mapping<extents_type> layout{extents_type{n_rows, n_cols}};
+
+    auto padded    = padded_mdspan(data_padded, layout);
+    auto row_major = row_major_mdspan(data_row_major, n_rows, n_cols);
+
+    int failures = 0;
+    for (int irow = 0; irow < n_rows; ++irow) {
+      for (int icol = 0; icol < n_cols; ++icol) {
+        if (padded(irow, icol) != row_major(irow, icol)) { ++failures; }
+      }
+    }
+    ASSERT_EQ(failures, 0);
+  }
+}
+
+TEST(MDSpan, LayoutRightPadded) { test_mdspan_layout_right_padded(); }
+
+void test_mdarray_padding()
+{
+  using extents_type = stdex::extents<size_t, dynamic_extent, dynamic_extent>;
+  auto s             = rmm::cuda_stream_default;
+
+  {
+    constexpr int rows            = 6;
+    constexpr int cols            = 7;
+    constexpr int alignment       = 5;
+    constexpr int alignment_bytes = sizeof(int) * alignment;
+
+    /**
+     * padded device array
+     */
+    using padded_layout_row_major =
+      stdex::layout_right_padded<detail::padding<float, alignment_bytes>::value>;
+
+    using padded_mdarray_type = device_mdarray<float, extents_type, padded_layout_row_major>;
+    padded_layout_row_major::mapping<extents_type> layout(extents_type(rows, cols));
+
+    auto device_policy = padded_mdarray_type::container_policy_type{s};
+    static_assert(std::is_same_v<typename decltype(device_policy)::accessor_type,
+                                 detail::device_uvector_policy<float>>);
+    padded_mdarray_type padded_device_array{layout, device_policy};
+
+    // direct access mdarray
+    padded_device_array(0, 3) = 1;
+    ASSERT_EQ(padded_device_array(0, 3), 1);
+
+    // non-const access via mdspan
+    auto d_view = padded_device_array.view();
+    static_assert(!decltype(d_view)::accessor_type::is_host_type::value);
+
+    thrust::device_vector<int32_t> status(1, 0);
+    auto p_status = status.data().get();
+    thrust::for_each_n(rmm::exec_policy(s),
+                       thrust::make_counting_iterator(0ul),
+                       1,
+                       [d_view, p_status] __device__(size_t i) {
+                         if (d_view(0, 3) != 1) { myAtomicAdd(p_status, 1); }
+                         d_view(0, 2) = 3;
+                         if (d_view(0, 2) != 3) { myAtomicAdd(p_status, 1); }
+                       });
+    check_status(p_status, s);
+
+    // const ref access via mdspan
+    auto const& arr = padded_device_array;
+    ASSERT_EQ(arr(0, 3), 1);
+    auto const_d_view = arr.view();
+    thrust::for_each_n(rmm::exec_policy(s),
+                       thrust::make_counting_iterator(0ul),
+                       1,
+                       [const_d_view, p_status] __device__(size_t i) {
+                         if (const_d_view(0, 3) != 1) { myAtomicAdd(p_status, 1); }
+                       });
+    check_status(p_status, s);
+
+    // initialize with sequence
+    thrust::for_each_n(
+      rmm::exec_policy(s),
+      thrust::make_counting_iterator(0ul),
+      rows * cols,
+      [d_view, rows, cols] __device__(size_t i) { d_view(i / cols, i % cols) = i; });
+
+    // manually create span with layout
+    {
+      auto data_padded         = padded_device_array.data_handle();
+      using padded_mdspan_type = device_mdspan<float, extents_type, padded_layout_row_major>;
+      auto padded_span         = padded_mdspan_type(data_padded, layout);
+      thrust::for_each_n(rmm::exec_policy(s),
+                         thrust::make_counting_iterator(0ul),
+                         rows * cols,
+                         [padded_span, rows, cols, p_status] __device__(size_t i) {
+                           if (padded_span(i / cols, i % cols) != i) myAtomicAdd(p_status, 1);
+                         });
+      check_status(p_status, s);
+    }
+
+    // utilities
+    static_assert(padded_device_array.rank_dynamic() == 2);
+    static_assert(padded_device_array.rank() == 2);
+    static_assert(padded_device_array.is_unique());
+    static_assert(padded_device_array.is_strided());
+
+    static_assert(
+      !std::is_nothrow_default_constructible<padded_mdarray_type>::value);  // cuda stream
+    static_assert(std::is_nothrow_move_constructible<padded_mdarray_type>::value);
+    static_assert(std::is_nothrow_move_assignable<padded_mdarray_type>::value);
+  }
+}
+
+TEST(MDArray, Padding) { test_mdarray_padding(); }
+
+// Test deactivated as submdspan support requires upstream changes
+/*void test_submdspan_padding()
+{
+  using extents_type = stdex::extents<dynamic_extent, dynamic_extent>;
+  auto s             = rmm::cuda_stream_default;
+
+  {
+    constexpr int rows            = 6;
+    constexpr int cols            = 7;
+    constexpr int alignment       = 5;
+    constexpr int alignment_bytes = sizeof(int) * alignment;
+
+    using layout_padded_general =
+      stdex::layout_padded_general<float, stdex::StorageOrderType::row_major_t, alignment_bytes>;
+    using padded_mdarray_type = device_mdarray<float, extents_type, layout_padded_general>;
+    using padded_mdspan_type  = device_mdspan<float, extents_type, layout_padded_general>;
+    layout_padded_general::mapping<extents_type> layout{extents_type{rows, cols}};
+
+    auto device_policy = padded_mdarray_type::container_policy_type{s};
+    static_assert(std::is_same_v<typename decltype(device_policy)::accessor_type,
+                                 detail::device_uvector_policy<float>>);
+    padded_mdarray_type padded_device_array{layout, device_policy};
+
+    // test status
+    thrust::device_vector<int32_t> status(1, 0);
+    auto p_status = status.data().get();
+
+    // initialize with sequence
+    {
+      auto d_view = padded_device_array.view();
+      static_assert(std::is_same_v<typename decltype(d_view)::layout_type, layout_padded_general>);
+      thrust::for_each_n(
+        rmm::exec_policy(s),
+        thrust::make_counting_iterator(0ul),
+        rows * cols,
+        [d_view, rows, cols] __device__(size_t i) { d_view(i / cols, i % cols) = i; });
+    }
+
+    // get mdspan manually from raw data
+    {
+      auto data_padded = padded_device_array.data();
+      auto padded_span = padded_mdspan_type(data_padded, layout);
+      thrust::for_each_n(rmm::exec_policy(s),
+                         thrust::make_counting_iterator(0ul),
+                         rows * cols,
+                         [padded_span, rows, cols, p_status] __device__(size_t i) {
+                           if (padded_span(i / cols, i % cols) != i) myAtomicAdd(p_status, 1);
+                         });
+      check_status(p_status, s);
+    }
+
+    // full subspan
+    {
+      auto padded_span  = padded_device_array.view();
+      auto subspan_full = stdex::submdspan(padded_span, stdex::full_extent, stdex::full_extent);
+      thrust::for_each_n(rmm::exec_policy(s),
+                         thrust::make_counting_iterator(0ul),
+                         cols * rows,
+                         [subspan_full, padded_span, rows, cols, p_status] __device__(size_t i) {
+                           if (subspan_full(i / cols, i % cols) != padded_span(i / cols, i % cols))
+                             myAtomicAdd(p_status, 1);
+                         });
+      check_status(p_status, s);
+
+      // resulting submdspan should still be padded
+      static_assert(
+        std::is_same_v<typename decltype(subspan_full)::layout_type, layout_padded_general>);
+    }
+
+    // slicing a row
+    {
+      auto padded_span = padded_device_array.view();
+      auto row3        = stdex::submdspan(padded_span, 3, stdex::full_extent);
+      thrust::for_each_n(rmm::exec_policy(s),
+                         thrust::make_counting_iterator(0ul),
+                         cols,
+                         [row3, padded_span, p_status] __device__(size_t i) {
+                           if (row3(i) != padded_span(3, i)) myAtomicAdd(p_status, 1);
+                         });
+      check_status(p_status, s);
+
+      // resulting submdspan should still be padded
+      static_assert(std::is_same_v<typename decltype(row3)::layout_type, layout_padded_general>);
+    }
+
+    // slicing a column
+    {
+      auto padded_span = padded_device_array.view();
+      auto col1        = stdex::submdspan(padded_span, stdex::full_extent, 1);
+      thrust::for_each_n(rmm::exec_policy(s),
+                         thrust::make_counting_iterator(0ul),
+                         rows,
+                         [col1, padded_span, p_status] __device__(size_t i) {
+                           if (col1(i) != padded_span(i, 1)) myAtomicAdd(p_status, 1);
+                         });
+      check_status(p_status, s);
+
+      // resulting submdspan is *NOT* padded anymore
+      static_assert(std::is_same_v<typename decltype(col1)::layout_type, stdex::layout_stride>);
+    }
+
+    // sub-rectangle of 6x7
+    {
+      auto padded_span = padded_device_array.view();
+      auto subspan =
+        stdex::submdspan(padded_span, std::make_tuple(1ul, 4ul), std::make_tuple(2ul, 5ul));
+      thrust::for_each_n(rmm::exec_policy(s),
+                         thrust::make_counting_iterator(0ul),
+                         (rows - 1) * (cols - 2),
+                         [subspan, rows, cols, padded_span, p_status] __device__(size_t i) {
+                           size_t idx = i / (cols - 2);
+                           size_t idy = i % (cols - 2);
+                           // elements > subspan range can be accessed as well
+                           if (subspan(idx, idy) != padded_span(idx + 1, idy + 2))
+                             myAtomicAdd(p_status, 1);
+                         });
+      check_status(p_status, s);
+
+      // resulting submdspan is *NOT* padded anymore
+      static_assert(std::is_same_v<typename decltype(subspan)::layout_type, stdex::layout_stride>);
+    }
+
+    // sub-rectangle (repeats the slices above, so the padded layout is not retained here)
+    {
+      auto padded_span = padded_device_array.view();
+      auto subspan =
+        stdex::submdspan(padded_span, std::make_tuple(1ul, 4ul), std::make_tuple(2ul, 5ul));
+      thrust::for_each_n(rmm::exec_policy(s),
+                         thrust::make_counting_iterator(0ul),
+                         (rows - 1) * (cols - 2),
+                         [subspan, rows, cols, padded_span, p_status] __device__(size_t i) {
+                           size_t idx = i / (cols - 2);
+                           size_t idy = i % (cols - 2);
+                           // elements > subspan range can be accessed as well
+                           if (subspan(idx, idy) != padded_span(idx + 1, idy + 2))
+                             myAtomicAdd(p_status, 1);
+                         });
+      check_status(p_status, s);
+
+      // resulting submdspan is *NOT* padded anymore
+      static_assert(std::is_same_v<typename decltype(subspan)::layout_type, stdex::layout_stride>);
+    }
+  }
+}
+
+TEST(MDSpan, SubmdspanPadding) { test_submdspan_padding(); }*/
+
+struct TestElement1 {  // element type of the mdarrays in test_mdspan_padding_by_type
+  int a, b;
+};
+// Checks element strides of padded row-major and col-major device mdarrays of TestElement1.
+void test_mdspan_padding_by_type()
+{
+  using extents_type = stdex::extents<size_t, dynamic_extent, dynamic_extent>;
+  auto s             = rmm::cuda_stream_default;
+
+  {
+    constexpr int rows            = 6;
+    constexpr int cols            = 7;
+    constexpr int alignment_bytes = 16;
+    // device-side counter: the lambdas below bump it via myAtomicAdd on every failed check
+    thrust::device_vector<int32_t> status(1, 0);
+    auto p_status = status.data().get();
+
+    // row major (c style): row starts differ by multiples of alignment_elements, rows are dense
+    {
+      using padded_layout_row_major = stdex::layout_right_padded<
+        detail::padding<std::remove_cv_t<std::remove_reference_t<TestElement1>>,
+                        alignment_bytes>::value>;
+
+      using padded_mdarray_type =
+        device_mdarray<TestElement1, extents_type, padded_layout_row_major>;
+      auto device_policy = padded_mdarray_type::container_policy_type{s};
+
+      padded_layout_row_major::mapping<extents_type> layout{extents_type{rows, cols}};
+      padded_mdarray_type padded_device_array{layout, device_policy};
+      int alignment_elements = detail::padding<TestElement1, alignment_bytes>::value;
+      auto padded_span       = padded_device_array.view();
+      thrust::for_each_n(
+        rmm::exec_policy(s),
+        thrust::make_counting_iterator(0ul),
+        rows * cols,
+        [rows, cols, padded_span, alignment_elements, p_status] __device__(size_t i) {
+          size_t idx = i / cols;
+          size_t idy = i % cols;
+          if ((&(padded_span(idx, idy)) - &(padded_span(0, idy))) % alignment_elements != 0)
+            myAtomicAdd(p_status, 1);
+          if ((&(padded_span(idx, idy)) - &(padded_span(idx, 0))) != idy) myAtomicAdd(p_status, 1);
+        });
+      check_status(p_status, s);
+    }
+
+    // col major (f style): column starts differ by multiples of alignment_elements, cols are dense
+    {
+      using padded_layout_col_major = stdex::layout_left_padded<
+        detail::padding<std::remove_cv_t<std::remove_reference_t<TestElement1>>,
+                        alignment_bytes>::value>;
+      using padded_mdarray_type =
+        device_mdarray<TestElement1, extents_type, padded_layout_col_major>;
+      auto device_policy = padded_mdarray_type::container_policy_type{s};
+
+      padded_layout_col_major::mapping<extents_type> layout{extents_type{rows, cols}};
+      padded_mdarray_type padded_device_array{layout, device_policy};
+      int alignment_elements = detail::padding<TestElement1, alignment_bytes>::value;
+      auto padded_span       = padded_device_array.view();
+      thrust::for_each_n(
+        rmm::exec_policy(s),
+        thrust::make_counting_iterator(0ul),
+        rows * cols,
+        [rows, cols, padded_span, alignment_elements, p_status] __device__(size_t i) {
+          size_t idx = i / cols;
+          size_t idy = i % cols;
+          if ((&(padded_span(idx, idy)) - &(padded_span(idx, 0))) % alignment_elements != 0)
+            myAtomicAdd(p_status, 1);
+          if ((&(padded_span(idx, idy)) - &(padded_span(0, idy))) != idx) myAtomicAdd(p_status, 1);
+        });
+      check_status(p_status, s);
+    }
+  }
+}
+
+TEST(MDSpan, MDSpanPaddingType) { test_mdspan_padding_by_type(); }
+// Checks host and device aligned matrix views built on top of hand-padded buffers.
+void test_mdspan_aligned_matrix()
+{
+  using extents_type = stdex::extents<size_t, dynamic_extent, dynamic_extent>;
+  constexpr int rows = 2;
+  constexpr int cols = 10;
+
+  // 2 x 10 values padded by hand to 16 entries per row, using X (-1) as filler
+  static constexpr int X = -1;
+  long data_padded[]     = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  X, X, X, X, X, X,
+                        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, X, X, X, X, X, X};
+  // NOTE(review): data_padded carries no alignas; confirm the aligned view accepts its address
+  auto my_aligned_host_span =
+    make_host_aligned_matrix_view<long, int, layout_right_padded<long>>(data_padded, rows, cols);
+
+  int failures = 0;
+  for (int irow = 0; irow < rows; ++irow) {
+    for (int icol = 0; icol < cols; ++icol) {
+      if (my_aligned_host_span(irow, icol) != irow * cols + icol) { ++failures; }
+    }
+  }
+  ASSERT_EQ(failures, 0);
+
+  // now work with device memory
+  // use simple 1D array to allocate some space (rows * 32 elements; 32 of them are filled below)
+  auto s          = rmm::cuda_stream_default;
+  using extent_1d = stdex::extents<size_t, dynamic_extent>;
+  layout_c_contiguous::mapping<extent_1d> layout_1d{extent_1d{rows * 32}};
+  using mdarray_t    = device_mdarray<long, extent_1d, layout_c_contiguous>;
+  auto device_policy = mdarray_t::container_policy_type{s};
+  mdarray_t device_array_1d{layout_1d, device_policy};
+
+  // direct access mdarray -- initialize with above data
+  for (int i = 0; i < 32; ++i) {
+    device_array_1d(i) = data_padded[i];
+  }
+
+  auto my_aligned_device_span =
+    make_device_aligned_matrix_view<long, int, layout_right_padded<long>>(
+      device_array_1d.data_handle(), rows, cols);
+
+  thrust::device_vector<int32_t> status(1, 0);
+  auto p_status = status.data().get();
+  thrust::for_each_n(rmm::exec_policy(s),
+                     thrust::make_counting_iterator(0ul),
+                     rows * cols,
+                     [rows, cols, my_aligned_device_span, p_status] __device__(size_t i) {
+                       size_t idx = i / cols;
+                       size_t idy = i % cols;
+                       if (my_aligned_device_span(idx, idy) != i) myAtomicAdd(p_status, 1);
+                     });
+  check_status(p_status, s);
+}
+
+TEST(MDSpan, MDSpanAlignedMatrix) { test_mdspan_aligned_matrix(); }
+
 namespace {
 void test_mdarray_unravel()
 {
@@ -527,4 +946,5 @@ void test_mdarray_unravel()
 }  // anonymous namespace
 
 TEST(MDArray, Unravel) { test_mdarray_unravel(); }
+
 }  // namespace raft
diff --git a/cpp/test/spatial/ann_ivf_flat.cu b/cpp/test/neighbors/ann_ivf_flat.cu
similarity index 90%
rename from cpp/test/spatial/ann_ivf_flat.cu
rename to cpp/test/neighbors/ann_ivf_flat.cu
index 01af7ea0bd..9a430e14f2 100644
--- a/cpp/test/spatial/ann_ivf_flat.cu
+++ b/cpp/test/neighbors/ann_ivf_flat.cu
@@ -154,7 +154,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
         auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
           (const DataT*)database.data(), ps.num_db_vecs, ps.dim);
 
-        auto index = ivf_flat::build_index(handle_, database_view, index_params);
+        auto index = ivf_flat::build(handle_, database_view, index_params);
 
         rmm::device_uvector<IdxT> vector_indices(ps.num_db_vecs, stream_);
         thrust::sequence(handle_.get_thrust_policy(),
@@ -169,20 +169,31 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
 
         auto index_2 = ivf_flat::extend(handle_, index, half_of_data_view);
 
+        auto new_half_of_data_view = raft::make_device_matrix_view<const DataT, IdxT>(
+          database.data() + half_of_data * ps.dim, IdxT(ps.num_db_vecs) - half_of_data, ps.dim);
+
+        auto new_half_of_data_indices_view = raft::make_device_vector_view<const IdxT, IdxT>(
+          vector_indices.data() + half_of_data, IdxT(ps.num_db_vecs) - half_of_data);
+
         ivf_flat::extend(handle_,
                          &index_2,
-                         database.data() + half_of_data * ps.dim,
-                         vector_indices.data() + half_of_data,
-                         IdxT(ps.num_db_vecs) - half_of_data);
-
+                         new_half_of_data_view,
+                         std::make_optional<raft::device_vector_view<const IdxT, IdxT>>(
+                           new_half_of_data_indices_view));
+
+        auto search_queries_view = raft::make_device_matrix_view<const DataT, IdxT>(
+          search_queries.data(), ps.num_queries, ps.dim);
+        auto indices_out_view = raft::make_device_matrix_view<IdxT, IdxT>(
+          indices_ivfflat_dev.data(), ps.num_queries, ps.k);
+        auto dists_out_view = raft::make_device_matrix_view<T, IdxT>(
+          distances_ivfflat_dev.data(), ps.num_queries, ps.k);
         ivf_flat::search(handle_,
-                         search_params,
                          index_2,
-                         search_queries.data(),
-                         ps.num_queries,
-                         ps.k,
-                         indices_ivfflat_dev.data(),
-                         distances_ivfflat_dev.data());
+                         search_queries_view,
+                         indices_out_view,
+                         dists_out_view,
+                         search_params,
+                         ps.k);
 
         update_host(distances_ivfflat.data(), distances_ivfflat_dev.data(), queries_size, stream_);
         update_host(indices_ivfflat.data(), indices_ivfflat_dev.data(), queries_size, stream_);
diff --git a/cpp/test/spatial/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
similarity index 100%
rename from cpp/test/spatial/ann_ivf_pq.cuh
rename to cpp/test/neighbors/ann_ivf_pq.cuh
diff --git a/cpp/test/spatial/ann_ivf_pq/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu
similarity index 100%
rename from cpp/test/spatial/ann_ivf_pq/test_float_int64_t.cu
rename to cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu
diff --git a/cpp/test/spatial/ann_ivf_pq/test_float_uint32_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
similarity index 100%
rename from cpp/test/spatial/ann_ivf_pq/test_float_uint32_t.cu
rename to cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
diff --git a/cpp/test/spatial/ann_ivf_pq/test_float_uint64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_uint64_t.cu
similarity index 100%
rename from cpp/test/spatial/ann_ivf_pq/test_float_uint64_t.cu
rename to cpp/test/neighbors/ann_ivf_pq/test_float_uint64_t.cu
diff --git a/cpp/test/spatial/ann_ivf_pq/test_int8_t_uint64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_int8_t_uint64_t.cu
similarity index 100%
rename from cpp/test/spatial/ann_ivf_pq/test_int8_t_uint64_t.cu
rename to cpp/test/neighbors/ann_ivf_pq/test_int8_t_uint64_t.cu
diff --git a/cpp/test/spatial/ann_ivf_pq/test_uint8_t_uint64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_uint8_t_uint64_t.cu
similarity index 100%
rename from cpp/test/spatial/ann_ivf_pq/test_uint8_t_uint64_t.cu
rename to cpp/test/neighbors/ann_ivf_pq/test_uint8_t_uint64_t.cu
diff --git a/cpp/test/spatial/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh
similarity index 100%
rename from cpp/test/spatial/ann_utils.cuh
rename to cpp/test/neighbors/ann_utils.cuh
diff --git a/cpp/test/spatial/ball_cover.cu b/cpp/test/neighbors/ball_cover.cu
similarity index 96%
rename from cpp/test/spatial/ball_cover.cu
rename to cpp/test/neighbors/ball_cover.cu
index d9ad9cc358..47030b0d62 100644
--- a/cpp/test/spatial/ball_cover.cu
+++ b/cpp/test/neighbors/ball_cover.cu
@@ -18,12 +18,12 @@
 #include "spatial_data.h"
 #include <raft/core/device_mdspan.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/neighbors/ball_cover.cuh>
 #include <raft/random/make_blobs.cuh>
-#include <raft/spatial/knn/ball_cover.cuh>
 #include <raft/spatial/knn/detail/knn_brute_force_faiss.cuh>
 #include <raft/util/cudart_utils.hpp>
 #if defined RAFT_NN_COMPILED
-#include <raft/spatial/knn/specializations.cuh>
+#include <raft/neighbors/specializations.cuh>
 #endif
 
 #include <rmm/device_uvector.hpp>
@@ -38,10 +38,7 @@
 #include <iostream>
 #include <vector>
 
-namespace raft {
-namespace spatial {
-namespace knn {
-
+namespace raft::neighbors::ball_cover {
 using namespace std;
 
 template <typename value_idx, typename value_t>
@@ -214,9 +211,8 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam<BallCoverInputs<va
 
     BallCoverIndex<value_idx, value_t, value_int, value_int> index(handle, X_view, metric);
 
-    raft::spatial::knn::rbc_build_index(handle, index);
-    raft::spatial::knn::rbc_knn_query(
-      handle, index, X2_view, d_pred_I_view, d_pred_D_view, k, true);
+    build_index(handle, index);
+    knn_query(handle, index, X2_view, d_pred_I_view, d_pred_D_view, k, true);
 
     handle.sync_stream();
     // What we really want are for the distances to match exactly. The
@@ -304,7 +300,7 @@ class BallCoverAllKNNTest : public ::testing::TestWithParam<BallCoverInputs<valu
 
     BallCoverIndex<value_idx, value_t> index(handle, X_view, metric);
 
-    raft::spatial::knn::rbc_all_knn_query(handle, index, d_pred_I_view, d_pred_D_view, k, true);
+    all_knn_query(handle, index, d_pred_I_view, d_pred_D_view, k, true);
 
     handle.sync_stream();
     // What we really want are for the distances to match exactly. The
@@ -365,6 +361,4 @@ INSTANTIATE_TEST_CASE_P(BallCoverKNNQueryTest,
 TEST_P(BallCoverAllKNNTestF, Fit) { basicTest(); }
 TEST_P(BallCoverKNNQueryTestF, Fit) { basicTest(); }
 
-}  // namespace knn
-}  // namespace spatial
-}  // namespace raft
+}  // namespace raft::neighbors::ball_cover
\ No newline at end of file
diff --git a/cpp/test/spatial/epsilon_neighborhood.cu b/cpp/test/neighbors/epsilon_neighborhood.cu
similarity index 100%
rename from cpp/test/spatial/epsilon_neighborhood.cu
rename to cpp/test/neighbors/epsilon_neighborhood.cu
diff --git a/cpp/test/spatial/faiss_mr.cu b/cpp/test/neighbors/faiss_mr.cu
similarity index 100%
rename from cpp/test/spatial/faiss_mr.cu
rename to cpp/test/neighbors/faiss_mr.cu
diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/neighbors/fused_l2_knn.cu
similarity index 99%
rename from cpp/test/spatial/fused_l2_knn.cu
rename to cpp/test/neighbors/fused_l2_knn.cu
index ef032ed442..b22d10bf54 100644
--- a/cpp/test/spatial/fused_l2_knn.cu
+++ b/cpp/test/neighbors/fused_l2_knn.cu
@@ -26,7 +26,7 @@
 #include <raft/spatial/knn/knn.cuh>
 
 #if defined RAFT_NN_COMPILED
-#include <raft/spatial/knn/specializations.cuh>
+#include <raft/neighbors/specializations.cuh>
 #endif
 
 #include <rmm/device_buffer.hpp>
diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/neighbors/haversine.cu
similarity index 100%
rename from cpp/test/spatial/haversine.cu
rename to cpp/test/neighbors/haversine.cu
diff --git a/cpp/test/spatial/knn.cu b/cpp/test/neighbors/knn.cu
similarity index 95%
rename from cpp/test/spatial/knn.cu
rename to cpp/test/neighbors/knn.cu
index 5807705038..eb5ecf663f 100644
--- a/cpp/test/spatial/knn.cu
+++ b/cpp/test/neighbors/knn.cu
@@ -19,9 +19,9 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
-#include <raft/spatial/knn/brute_force.cuh>
+#include <raft/neighbors/brute_force.cuh>
 #if defined RAFT_NN_COMPILED
-#include <raft/spatial/knn/specializations.cuh>
+#include <raft/neighbors/specializations.cuh>
 #endif
 
 #include <rmm/device_buffer.hpp>
@@ -32,9 +32,7 @@
 #include <iostream>
 #include <vector>
 
-namespace raft {
-namespace spatial {
-namespace knn {
+namespace raft::neighbors::brute_force {
 struct KNNInputs {
   std::vector<std::vector<float>> input;
   int k;
@@ -96,7 +94,8 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
     auto distances =
       raft::make_device_matrix_view<T, IdxT, row_major>(distances_.data(), rows_, k_);
 
-    brute_force_knn(handle, index, search, indices, distances, k_);
+    auto metric = raft::distance::DistanceType::L2Unexpanded;
+    knn(handle, index, search, indices, distances, k_, metric, std::make_optional<IdxT>(0));
 
     build_actual_output<<<raft::ceildiv(rows_ * k_, 32), 32, 0, stream>>>(
       actual_labels_.data(), rows_, k_, search_labels_.data(), indices_.data());
@@ -197,6 +196,4 @@ TEST_P(KNNTestFuint32_t, BruteForce) { this->testBruteForce(); }
 INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestFint64_t, ::testing::ValuesIn(inputs));
 INSTANTIATE_TEST_CASE_P(KNNTest, KNNTestFuint32_t, ::testing::ValuesIn(inputs));
 
-}  // namespace knn
-}  // namespace spatial
-}  // namespace raft
+}  // namespace raft::neighbors::brute_force
diff --git a/cpp/test/spatial/selection.cu b/cpp/test/neighbors/selection.cu
similarity index 99%
rename from cpp/test/spatial/selection.cu
rename to cpp/test/neighbors/selection.cu
index 7b1f92f182..bfcfca5ead 100644
--- a/cpp/test/spatial/selection.cu
+++ b/cpp/test/neighbors/selection.cu
@@ -25,7 +25,7 @@
 #include <raft/sparse/detail/utils.h>
 #include <raft/spatial/knn/knn.cuh>
 #if defined RAFT_NN_COMPILED
-#include <raft/spatial/knn/specializations.cuh>
+#include <raft/neighbors/specializations.cuh>
 #endif
 
 namespace raft::spatial::selection {
diff --git a/cpp/test/spatial/spatial_data.h b/cpp/test/neighbors/spatial_data.h
similarity index 100%
rename from cpp/test/spatial/spatial_data.h
rename to cpp/test/neighbors/spatial_data.h
diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/neighbors/brute_force.cu
similarity index 75%
rename from cpp/test/sparse/knn.cu
rename to cpp/test/sparse/neighbors/brute_force.cu
index 6717ba411d..8fa5e8322d 100644
--- a/cpp/test/sparse/knn.cu
+++ b/cpp/test/sparse/neighbors/brute_force.cu
@@ -17,9 +17,9 @@
 #include <cusparse_v2.h>
 #include <gtest/gtest.h>
 
-#include "../test_utils.h"
+#include "../../test_utils.h"
 #include <raft/distance/distance_types.hpp>
-#include <raft/sparse/spatial/knn.cuh>
+#include <raft/sparse/neighbors/knn.cuh>
 
 #include <raft/util/cudart_utils.hpp>
 
@@ -79,25 +79,25 @@ class SparseKNNTest : public ::testing::TestWithParam<SparseKNNInputs<value_idx,
 
     make_data();
 
-    raft::sparse::spatial::brute_force_knn<value_idx, value_t>(indptr.data(),
-                                                               indices.data(),
-                                                               data.data(),
-                                                               nnz,
-                                                               n_rows,
-                                                               params.n_cols,
-                                                               indptr.data(),
-                                                               indices.data(),
-                                                               data.data(),
-                                                               nnz,
-                                                               n_rows,
-                                                               params.n_cols,
-                                                               out_indices.data(),
-                                                               out_dists.data(),
-                                                               k,
-                                                               handle,
-                                                               params.batch_size_index,
-                                                               params.batch_size_query,
-                                                               params.metric);
+    raft::sparse::neighbors::brute_force_knn<value_idx, value_t>(indptr.data(),
+                                                                 indices.data(),
+                                                                 data.data(),
+                                                                 nnz,
+                                                                 n_rows,
+                                                                 params.n_cols,
+                                                                 indptr.data(),
+                                                                 indices.data(),
+                                                                 data.data(),
+                                                                 nnz,
+                                                                 n_rows,
+                                                                 params.n_cols,
+                                                                 out_indices.data(),
+                                                                 out_dists.data(),
+                                                                 k,
+                                                                 handle,
+                                                                 params.batch_size_index,
+                                                                 params.batch_size_query,
+                                                                 params.metric);
 
     RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
   }
diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/neighbors/connect_components.cu
similarity index 97%
rename from cpp/test/sparse/connect_components.cu
rename to cpp/test/sparse/neighbors/connect_components.cu
index 6278e7ef80..fc4eecd4ee 100644
--- a/cpp/test/sparse/connect_components.cu
+++ b/cpp/test/sparse/neighbors/connect_components.cu
@@ -24,8 +24,8 @@
 
 #include <raft/sparse/linalg/symmetrize.cuh>
 #include <raft/sparse/mst/mst.cuh>
+#include <raft/sparse/neighbors/knn_graph.cuh>
 #include <raft/sparse/selection/connect_components.cuh>
-#include <raft/sparse/spatial/knn_graph.cuh>
 
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/transpose.cuh>
@@ -34,7 +34,7 @@
 #include <raft/sparse/hierarchy/single_linkage.cuh>
 #include <rmm/device_uvector.hpp>
 
-#include "../test_utils.h"
+#include "../../test_utils.h"
 
 namespace raft {
 namespace sparse {
@@ -75,13 +75,13 @@ class ConnectComponentsTest
      */
     raft::sparse::COO<value_t, value_idx> knn_graph_coo(stream);
 
-    raft::sparse::spatial::knn_graph(handle,
-                                     data.data(),
-                                     params.n_row,
-                                     params.n_col,
-                                     raft::distance::DistanceType::L2SqrtExpanded,
-                                     knn_graph_coo,
-                                     params.c);
+    raft::sparse::neighbors::knn_graph(handle,
+                                       data.data(),
+                                       params.n_row,
+                                       params.n_col,
+                                       raft::distance::DistanceType::L2SqrtExpanded,
+                                       knn_graph_coo,
+                                       params.c);
 
     raft::sparse::convert::sorted_coo_to_csr(
       knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), params.n_row + 1, stream);
diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/neighbors/knn_graph.cu
similarity index 96%
rename from cpp/test/sparse/knn_graph.cu
rename to cpp/test/sparse/neighbors/knn_graph.cu
index 47c1819e79..d6f9e8386f 100644
--- a/cpp/test/sparse/knn_graph.cu
+++ b/cpp/test/sparse/neighbors/knn_graph.cu
@@ -14,14 +14,14 @@
  * limitations under the License.
  */
 
-#include "../test_utils.h"
+#include "../../test_utils.h"
 #include <gtest/gtest.h>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <raft/sparse/coo.hpp>
-#include <raft/sparse/spatial/knn_graph.cuh>
+#include <raft/sparse/neighbors/knn_graph.cuh>
 #if defined RAFT_NN_COMPILED
 #include <raft/spatial/knn/specializations.cuh>
 #endif
@@ -77,7 +77,7 @@ class KNNGraphTest : public ::testing::TestWithParam<KNNGraphInputs<value_idx, v
 
     update_device(X.data(), params.X.data(), params.X.size(), stream);
 
-    raft::sparse::spatial::knn_graph(
+    raft::sparse::neighbors::knn_graph(
       handle, X.data(), params.m, params.n, raft::distance::DistanceType::L2Unexpanded, *out);
 
     rmm::device_scalar<value_idx> sum(stream);
diff --git a/cpp/test/stats/accuracy.cu b/cpp/test/stats/accuracy.cu
new file mode 100644
index 0000000000..192c187794
--- /dev/null
+++ b/cpp/test/stats/accuracy.cu
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <optional>
+#include <raft/interruptible.hpp>
+#include <raft/random/rng.cuh>
+#include <raft/stats/accuracy.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <rmm/device_uvector.hpp>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+
+namespace raft {
+namespace stats {
+
+template <typename T>
+struct AccuracyInputs {
+  T tolerance;                  // tolerance given to raft::CompareApprox in the Result tests
+  int nrows;                    // number of predictions / reference labels generated
+  unsigned long long int seed;  // seed for the raft::random::RngState used to generate them
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const AccuracyInputs<T>& dims)
+{
+  return os;  // writes nothing: `dims` is not streamed, `os` is returned as-is
+}
+// Fixture: compares accuracy() on random device labels against a reference computed on the host.
+template <typename T>
+class AccuracyTest : public ::testing::TestWithParam<AccuracyInputs<T>> {
+ protected:
+  AccuracyTest() : stream(handle.get_stream()) {}
+
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<AccuracyInputs<T>>::GetParam();
+    raft::random::RngState r(params.seed);
+    rmm::device_uvector<int> predictions(params.nrows, stream);
+    rmm::device_uvector<int> ref_predictions(params.nrows, stream);
+    uniformInt(handle, r, predictions.data(), params.nrows, 0, 10);
+    uniformInt(handle, r, ref_predictions.data(), params.nrows, 0, 10);
+
+    actualVal =
+      accuracy(handle,
+               raft::make_device_vector_view<const int>(predictions.data(), params.nrows),
+               raft::make_device_vector_view<const int>(ref_predictions.data(), params.nrows));
+    std::vector<int> h_predictions(params.nrows, 0);
+    std::vector<int> h_ref_predictions(params.nrows, 0);
+    raft::update_host(h_predictions.data(), predictions.data(), params.nrows, stream);
+    raft::update_host(h_ref_predictions.data(), ref_predictions.data(), params.nrows, stream);
+    // the copies are issued on `stream`: wait for them before the host reads the buffers
+    raft::interruptible::synchronize(stream);
+
+    unsigned long long correctly_predicted = 0ULL;
+    for (int i = 0; i < params.nrows; ++i) {
+      correctly_predicted += (h_predictions[i] - h_ref_predictions[i]) == 0;
+    }
+    expectedVal = static_cast<T>(correctly_predicted) / params.nrows;  // reference, kept in T
+  }
+
+ protected:
+  AccuracyInputs<T> params;
+  raft::handle_t handle;
+  cudaStream_t stream = 0;
+  T expectedVal, actualVal;
+};
+
+const std::vector<AccuracyInputs<float>> inputsf = {
+  {0.001f, 30, 1234ULL}, {0.001f, 100, 1234ULL}, {0.001f, 1000, 1234ULL}};  // {tol, nrows, seed}
+typedef AccuracyTest<float> AccuracyTestF;  // single-precision instantiation of the fixture
+TEST_P(AccuracyTestF, Result)
+{
+  auto eq = raft::CompareApprox<float>(params.tolerance);
+  ASSERT_TRUE(match(expectedVal, actualVal, eq));
+}
+INSTANTIATE_TEST_CASE_P(AccuracyTests, AccuracyTestF, ::testing::ValuesIn(inputsf));
+
+const std::vector<AccuracyInputs<double>> inputsd = {
+  {0.001, 30, 1234ULL}, {0.001, 100, 1234ULL}, {0.001, 1000, 1234ULL}};  // {tol, nrows, seed}
+typedef AccuracyTest<double> AccuracyTestD;  // double-precision instantiation of the fixture
+TEST_P(AccuracyTestD, Result)
+{
+  auto eq = raft::CompareApprox<double>(params.tolerance);
+  ASSERT_TRUE(match(expectedVal, actualVal, eq));
+}
+INSTANTIATE_TEST_CASE_P(AccuracyTests, AccuracyTestD, ::testing::ValuesIn(inputsd));
+
+}  // end namespace stats
+}  // end namespace raft
diff --git a/cpp/test/stats/r2_score.cu b/cpp/test/stats/r2_score.cu
new file mode 100644
index 0000000000..d77daacb04
--- /dev/null
+++ b/cpp/test/stats/r2_score.cu
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <optional>
+#include <raft/interruptible.hpp>
+#include <raft/random/rng.cuh>
+#include <raft/stats/r2_score.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <rmm/device_uvector.hpp>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+
+namespace raft {
+namespace stats {
+
+template <typename T>
+struct R2_scoreInputs {
+  T tolerance;                  // passed to raft::CompareApprox when the results are compared
+  int nrows;                    // number of elements generated for y and y_hat
+  unsigned long long int seed;  // seed of the raft::random::RngState used to fill the inputs
+};
+// Streams every field so that gtest can show the parameters of a failing case.
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const R2_scoreInputs<T>& dims)
+{
+  return os << "tol=" << dims.tolerance << " nrows=" << dims.nrows << " seed=" << dims.seed;
+}
+
+template <typename T>
+class R2_scoreTest : public ::testing::TestWithParam<R2_scoreInputs<T>> {
+ protected:
+  R2_scoreTest() : stream(handle.get_stream()) {}
+
+  // Fills y and y_hat through uniform(..., -1, 1), stores the r2_score result in actualVal and
+  // recomputes 1 - sse / ssto from host copies of the same data into expectedVal.
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<R2_scoreInputs<T>>::GetParam();
+    raft::random::RngState r(params.seed);
+    rmm::device_uvector<T> y(params.nrows, stream);
+    rmm::device_uvector<T> y_hat(params.nrows, stream);
+    uniform(handle, r, y.data(), params.nrows, (T)-1.0, (T)1.0);
+    uniform(handle, r, y_hat.data(), params.nrows, (T)-1.0, (T)1.0);
+
+    actualVal = r2_score(handle,
+                         raft::make_device_vector_view<const T>(y.data(), params.nrows),
+                         raft::make_device_vector_view<const T>(y_hat.data(), params.nrows));
+    std::vector<T> h_y(params.nrows, 0);
+    std::vector<T> h_y_hat(params.nrows, 0);
+    raft::update_host(h_y.data(), y.data(), params.nrows, stream);
+    raft::update_host(h_y_hat.data(), y_hat.data(), params.nrows, stream);
+    // update_host copies on `stream`: synchronize before the host loops read the buffers.
+    raft::interruptible::synchronize(stream);
+    T mean = T(0);
+    for (int i = 0; i < params.nrows; ++i) {
+      mean += h_y[i];
+    }
+    mean /= params.nrows;
+
+    T sse  = T(0);  // sum of squared differences between y and y_hat
+    T ssto = T(0);  // sum of squared differences between y and its mean
+    for (int i = 0; i < params.nrows; ++i) {
+      sse += (h_y[i] - h_y_hat[i]) * (h_y[i] - h_y_hat[i]);
+      ssto += (h_y[i] - mean) * (h_y[i] - mean);
+    }
+    expectedVal = 1.0 - sse / ssto;
+  }
+
+  // actualVal: value returned by r2_score; expectedVal: host-side reference value.
+  R2_scoreInputs<T> params;
+  raft::handle_t handle;
+  cudaStream_t stream = 0;
+  T expectedVal, actualVal;
+};
+
+const std::vector<R2_scoreInputs<float>> inputsf = {
+  {0.001f, 30, 1234ULL}, {0.001f, 100, 1234ULL}, {0.001f, 1000, 1234ULL}};
+using R2_scoreTestF = R2_scoreTest<float>;  // fixture instantiated for float inputs
+TEST_P(R2_scoreTestF, Result)
+{
+  raft::CompareApprox<float> eq(params.tolerance);
+  ASSERT_TRUE(match(expectedVal, actualVal, eq));  // host reference vs. r2_score result
+}
+INSTANTIATE_TEST_CASE_P(R2_scoreTests, R2_scoreTestF, ::testing::ValuesIn(inputsf));
+
+const std::vector<R2_scoreInputs<double>> inputsd = {
+  {0.001, 30, 1234ULL}, {0.001, 100, 1234ULL}, {0.001, 1000, 1234ULL}};
+using R2_scoreTestD = R2_scoreTest<double>;  // fixture instantiated for double inputs
+TEST_P(R2_scoreTestD, Result)
+{
+  raft::CompareApprox<double> eq(params.tolerance);
+  ASSERT_TRUE(match(expectedVal, actualVal, eq));  // host reference vs. r2_score result
+}
+INSTANTIATE_TEST_CASE_P(R2_scoreTests, R2_scoreTestD, ::testing::ValuesIn(inputsd));
+
+}  // end namespace stats
+}  // end namespace raft
diff --git a/cpp/test/stats/regression_metrics.cu b/cpp/test/stats/regression_metrics.cu
new file mode 100644
index 0000000000..effc3d04dd
--- /dev/null
+++ b/cpp/test/stats/regression_metrics.cu
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <algorithm>
+#include <gtest/gtest.h>
+#include <optional>
+#include <raft/interruptible.hpp>
+#include <raft/random/rng.cuh>
+#include <raft/stats/regression_metrics.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <rmm/device_uvector.hpp>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+
+namespace raft {
+namespace stats {
+
+template <typename T>
+struct RegressionInputs {
+  T tolerance;                  // passed to raft::CompareApprox when the metrics are compared
+  int len;                      // number of elements in predictions and ref_predictions
+  unsigned long long int seed;  // seed of the raft::random::RngState used to fill the inputs
+};
+// Streams every field so that gtest can show the parameters of a failing case.
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const RegressionInputs<T>& dims)
+{
+  return os << "tol=" << dims.tolerance << " len=" << dims.len << " seed=" << dims.seed;
+}
+
+// Host reference: mean absolute, mean squared and median absolute error of the element-wise
+// difference predictions[i] - ref_predictions[i]; all three are reported as 0 for empty input.
+template <typename T>
+void naive_reg_metrics(const std::vector<T>& predictions,
+                       const std::vector<T>& ref_predictions,
+                       double& mean_abs_error,
+                       double& mean_squared_error,
+                       double& median_abs_error)
+{
+  mean_abs_error = mean_squared_error = median_abs_error = 0;
+  const auto len = predictions.size();
+  if (len == 0) { return; }  // avoids 0 / 0 and the abs_errors[middle - 1] access below
+  std::vector<double> abs_errors(len);
+  double abs_sum = 0, sq_sum = 0;
+  for (std::size_t i = 0; i < len; ++i) {
+    const auto diff = predictions[i] - ref_predictions[i];
+    // Sign flip instead of unqualified abs(), whose overload set depends on the headers in use.
+    abs_errors[i] = diff < 0 ? -diff : diff;
+    abs_sum += abs_errors[i];
+    sq_sum += diff * diff;
+  }
+  mean_abs_error     = abs_sum / len;
+  mean_squared_error = sq_sum / len;
+  std::sort(abs_errors.begin(), abs_errors.end());
+  const auto middle = len / 2;  // odd len: the median itself; even len: upper of the two
+  median_abs_error  = (len % 2 == 1) ? abs_errors[middle]
+                                     : (abs_errors[middle] + abs_errors[middle - 1]) / 2;
+}
+
+template <typename T>
+class RegressionTest : public ::testing::TestWithParam<RegressionInputs<T>> {
+ protected:
+  RegressionTest() : stream(handle.get_stream()) {}
+
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<RegressionInputs<T>>::GetParam();
+    raft::random::RngState r(params.seed);
+    rmm::device_uvector<T> predictions(params.len, stream);
+    rmm::device_uvector<T> ref_predictions(params.len, stream);
+    uniform(handle, r, predictions.data(), params.len, T(-10.0), T(10.0));
+    uniform(handle, r, ref_predictions.data(), params.len, T(-10.0), T(10.0));
+    // Metrics under test; the results are written through the host scalar views.
+    regression_metrics(handle,
+                       raft::make_device_vector_view<const T>(predictions.data(), params.len),
+                       raft::make_device_vector_view<const T>(ref_predictions.data(), params.len),
+                       raft::make_host_scalar_view(&mean_abs_error),
+                       raft::make_host_scalar_view(&mean_squared_error),
+                       raft::make_host_scalar_view(&median_abs_error));
+    std::vector<T> h_predictions(params.len, 0);
+    std::vector<T> h_ref_predictions(params.len, 0);
+    raft::update_host(h_predictions.data(), predictions.data(), params.len, stream);
+    raft::update_host(h_ref_predictions.data(), ref_predictions.data(), params.len, stream);
+    // update_host copies on `stream`: synchronize before the host reference reads the data.
+    raft::interruptible::synchronize(stream);
+    naive_reg_metrics(h_predictions,
+                      h_ref_predictions,
+                      ref_mean_abs_error,
+                      ref_mean_squared_error,
+                      ref_median_abs_error);
+  }
+
+  // The ref_* members hold the host reference values; their counterparts hold the results.
+  RegressionInputs<T> params;
+  raft::handle_t handle;
+  cudaStream_t stream           = 0;
+  double mean_abs_error         = 0;
+  double mean_squared_error     = 0;
+  double median_abs_error       = 0;
+  double ref_mean_abs_error     = 0;
+  double ref_mean_squared_error = 0;
+  double ref_median_abs_error   = 0;
+};
+
+const std::vector<RegressionInputs<float>> inputsf = {
+  {0.001f, 30, 1234ULL}, {0.001f, 100, 1234ULL}, {0.001f, 4000, 1234ULL}};
+using RegressionTestF = RegressionTest<float>;  // fixture instantiated for float inputs
+TEST_P(RegressionTestF, Result)
+{
+  raft::CompareApprox<float> eq(params.tolerance);  // one comparator shared by all metrics
+  ASSERT_TRUE(match(ref_mean_abs_error, mean_abs_error, eq));
+  ASSERT_TRUE(match(ref_mean_squared_error, mean_squared_error, eq));
+  ASSERT_TRUE(match(ref_median_abs_error, median_abs_error, eq));
+}
+INSTANTIATE_TEST_CASE_P(RegressionTests, RegressionTestF, ::testing::ValuesIn(inputsf));
+
+const std::vector<RegressionInputs<double>> inputsd = {
+  {0.001, 30, 1234ULL}, {0.001, 100, 1234ULL}, {0.001, 4000, 1234ULL}};
+using RegressionTestD = RegressionTest<double>;  // fixture instantiated for double inputs
+TEST_P(RegressionTestD, Result)
+{
+  raft::CompareApprox<double> eq(params.tolerance);  // one comparator shared by all metrics
+  ASSERT_TRUE(match(ref_mean_abs_error, mean_abs_error, eq));
+  ASSERT_TRUE(match(ref_mean_squared_error, mean_squared_error, eq));
+  ASSERT_TRUE(match(ref_median_abs_error, median_abs_error, eq));
+}
+INSTANTIATE_TEST_CASE_P(RegressionTests, RegressionTestD, ::testing::ValuesIn(inputsd));
+
+}  // end namespace stats
+}  // end namespace raft
diff --git a/cpp/test/stats/weighted_mean.cu b/cpp/test/stats/weighted_mean.cu
index ec99d5a627..9f33855572 100644
--- a/cpp/test/stats/weighted_mean.cu
+++ b/cpp/test/stats/weighted_mean.cu
@@ -15,7 +15,9 @@
  */
 
 #include "../test_utils.h"
+#include <cstdint>
 #include <gtest/gtest.h>
+#include <raft/core/device_mdspan.hpp>
 #include <raft/random/rng.cuh>
 #include <raft/stats/weighted_mean.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -87,11 +89,23 @@ class RowWeightedMeanTest : public ::testing::TestWithParam<WeightedMeanInputs<T
     thrust::host_vector<T> hexp(rows);
 
     // compute naive result & copy to GPU
-    naiveRowWeightedMean(hexp.data(), hin.data(), hweights.data(), rows, cols, true);
-    dexp = hexp;
-
-    // compute result
-    rowWeightedMean(dact.data().get(), din.data().get(), dweights.data().get(), cols, rows, stream);
+    naiveRowWeightedMean(hexp.data(), hin.data(), hweights.data(), rows, cols, params.row_major);
+    dexp        = hexp;
+    auto output = raft::make_device_vector_view<T, std::uint32_t>(dact.data().get(), rows);
+    auto weights =
+      raft::make_device_vector_view<const T, std::uint32_t>(dweights.data().get(), cols);
+
+    if (params.row_major) {
+      auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::row_major>(
+        din.data().get(), rows, cols);
+      // compute result
+      row_weighted_mean(handle, input, weights, output);
+    } else {
+      auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::col_major>(
+        din.data().get(), rows, cols);
+      // compute result
+      row_weighted_mean(handle, input, weights, output);
+    }
 
     // adjust tolerance to account for round-off accumulation
     params.tolerance *= params.N;
@@ -150,12 +164,23 @@ class ColWeightedMeanTest : public ::testing::TestWithParam<WeightedMeanInputs<T
     thrust::host_vector<T> hexp(cols);
 
     // compute naive result & copy to GPU
-    naiveColWeightedMean(hexp.data(), hin.data(), hweights.data(), rows, cols, true);
+    naiveColWeightedMean(hexp.data(), hin.data(), hweights.data(), rows, cols, params.row_major);
     dexp = hexp;
 
-    // compute result
-    colWeightedMean(dact.data().get(), din.data().get(), dweights.data().get(), cols, rows, stream);
-
+    auto output = raft::make_device_vector_view<T, std::uint32_t>(dact.data().get(), cols);
+    auto weights =
+      raft::make_device_vector_view<const T, std::uint32_t>(dweights.data().get(), rows);
+    if (params.row_major) {
+      auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::row_major>(
+        din.data().get(), rows, cols);
+      // compute result
+      col_weighted_mean(handle, input, weights, output);
+    } else {
+      auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::col_major>(
+        din.data().get(), rows, cols);
+      // compute result
+      col_weighted_mean(handle, input, weights, output);
+    }
     // adjust tolerance to account for round-off accumulation
     params.tolerance *= params.M;
   }
@@ -200,16 +225,20 @@ class WeightedMeanTest : public ::testing::TestWithParam<WeightedMeanInputs<T>>
       naiveColWeightedMean(hexp.data(), hin.data(), hweights.data(), rows, cols, params.row_major);
     dexp = hexp;
 
-    // compute result
-    weightedMean(dact.data().get(),
-                 din.data().get(),
-                 dweights.data().get(),
-                 cols,
-                 rows,
-                 params.row_major,
-                 params.along_rows,
-                 stream);
-
+    auto output = raft::make_device_vector_view<T, std::uint32_t>(dact.data().get(), mean_size);
+    auto weights =
+      raft::make_device_vector_view<const T, std::uint32_t>(dweights.data().get(), weight_size);
+    if (params.row_major) {
+      auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::row_major>(
+        din.data().get(), rows, cols);
+      // compute result
+      weighted_mean(handle, input, weights, output, params.along_rows);
+    } else {
+      auto input = raft::make_device_matrix_view<const T, std::uint32_t, raft::col_major>(
+        din.data().get(), rows, cols);
+      // compute result
+      weighted_mean(handle, input, weights, output, params.along_rows);
+    }
     // adjust tolerance to account for round-off accumulation
     params.tolerance *= params.N;
   }
@@ -226,6 +255,10 @@ static const float tolF  = 128 * std::numeric_limits<float>::epsilon();
 static const double tolD = 256 * std::numeric_limits<double>::epsilon();
 
 const std::vector<WeightedMeanInputs<float>> inputsf = {{tolF, 4, 4, 1234, true, true},
+                                                        {tolF, 32, 32, 1234, true, false},
+                                                        {tolF, 32, 64, 1234, false, false},
+                                                        {tolF, 32, 256, 1234, true, true},
+                                                        {tolF, 32, 256, 1234, false, false},
                                                         {tolF, 1024, 32, 1234, true, false},
                                                         {tolF, 1024, 64, 1234, true, true},
                                                         {tolF, 1024, 128, 1234, true, false},
@@ -236,6 +269,10 @@ const std::vector<WeightedMeanInputs<float>> inputsf = {{tolF, 4, 4, 1234, true,
                                                         {tolF, 1024, 256, 1234, false, true}};
 
 const std::vector<WeightedMeanInputs<double>> inputsd = {{tolD, 4, 4, 1234, true, true},
+                                                         {tolD, 32, 32, 1234, true, false},
+                                                         {tolD, 32, 64, 1234, false, false},
+                                                         {tolD, 32, 256, 1234, true, true},
+                                                         {tolD, 32, 256, 1234, false, false},
                                                          {tolD, 1024, 32, 1234, true, false},
                                                          {tolD, 1024, 64, 1234, true, true},
                                                          {tolD, 1024, 128, 1234, true, false},
@@ -280,16 +317,20 @@ INSTANTIATE_TEST_CASE_P(ColWeightedMeanTest, ColWeightedMeanTestD, ::testing::Va
 using WeightedMeanTestF = WeightedMeanTest<float>;
 TEST_P(WeightedMeanTestF, Result)
 {
+  auto mean_size = params.along_rows ? params.M : params.N;
   ASSERT_TRUE(devArrMatch(
-    dexp.data().get(), dact.data().get(), params.N, raft::CompareApprox<float>(params.tolerance)));
+    dexp.data().get(), dact.data().get(), mean_size, raft::CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_CASE_P(WeightedMeanTest, WeightedMeanTestF, ::testing::ValuesIn(inputsf));
 
 using WeightedMeanTestD = WeightedMeanTest<double>;
 TEST_P(WeightedMeanTestD, Result)
 {
-  ASSERT_TRUE(devArrMatch(
-    dexp.data().get(), dact.data().get(), params.N, raft::CompareApprox<double>(params.tolerance)));
+  auto mean_size = params.along_rows ? params.M : params.N;
+  ASSERT_TRUE(devArrMatch(dexp.data().get(),
+                          dact.data().get(),
+                          mean_size,
+                          raft::CompareApprox<double>(params.tolerance)));
 }
 INSTANTIATE_TEST_CASE_P(WeightedMeanTest, WeightedMeanTestD, ::testing::ValuesIn(inputsd));
 
diff --git a/docs/README.md b/docs/README.md
index ced8e63938..a09ccf41eb 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,7 +1,7 @@
 # Building Documentation
 ## Building locally:
 
-#### [Build and install RAFT](../BUILD.md)
+#### [Build and install RAFT](source/build.md)
 
 #### Generate the docs
 ```shell script
diff --git a/BUILD.md b/docs/source/build.md
similarity index 86%
rename from BUILD.md
rename to docs/source/build.md
index c94bb24204..b75e67d82f 100644
--- a/BUILD.md
+++ b/docs/source/build.md
@@ -1,38 +1,21 @@
-# RAFT Build and Development Guide
-
-- [Building and installing RAFT](#build_install)
-    - [CUDA/GPU Requirements](#cuda_gpu_req)
-    - [Build Dependencies](#required_depenencies)
-    - [Header-only C++](#install_header_only_cpp)
-    - [C++ Shared Libraries](#shared_cpp_libs)
-    - [Improving Rebuild Times](#ccache)
-    - [Googletests](#gtests)
-    - [Googlebench](#gbench)
-    - [C++ Using Cmake](#cpp_using_cmake)
-    - [Python](#python)
-    - [Documentation](#docs)
-- [Using RAFT in downstream projects](#use_raft)
-    - [Cmake Header-only Integration](#cxx_integration)
-    - [Using Shared Libraries in Cmake](#use_shared_libs)
-    - [Building RAFT C++ from source](#build_cxx_source)
-    - [Python/Cython Integration](#py_integration)
-
-## <a id="build_install"></a>Building and installing RAFT
-
-### <a id="cuda_gpu_req"></a>CUDA/GPU Requirements
+# Install Guide
+
+## Building and installing RAFT
+
+### CUDA/GPU Requirements
 - CUDA Toolkit 11.0+
 - NVIDIA driver 450.80.02+
 - Pascal architecture of better (compute capability >= 6.0)
 
-### <a id="required_dependencies"></a>Build Dependencies
+### Build Dependencies
 
 In addition to the libraries included with cudatoolkit 11.0+, there are some other dependencies below for building RAFT from source. Many of the dependencies are optional and depend only on the primitives being used. All of these can be installed with cmake or [rapids-cpm](https://github.com/rapidsai/rapids-cmake#cpm) and many of them can be installed with [conda](https://anaconda.org).
 
 #### Required
 - [RMM](https://github.com/rapidsai/rmm) corresponding to RAFT version.
+- [Thrust](https://github.com/NVIDIA/thrust) v1.17 / [CUB](https://github.com/NVIDIA/cub)
 
 #### Optional
-- [Thrust](https://github.com/NVIDIA/thrust) v1.15 / [CUB](https://github.com/NVIDIA/cub) - On by default but can be disabled.
 - [cuCollections](https://github.com/NVIDIA/cuCollections) - Used in `raft::sparse::distance` API.
 - [Libcu++](https://github.com/NVIDIA/libcudacxx) v1.7.0
 - [FAISS](https://github.com/facebookresearch/faiss) v1.7.0 - Used in `raft::spatial::knn` API and needed to build tests.
@@ -46,7 +29,7 @@ C++ RAFT is a header-only library but provides the option of building shared lib
 
 The recommended way to build and install RAFT is to use the `build.sh` script in the root of the repository. This script can build both the C++ and Python artifacts and provides options for building and installing the headers, tests, benchmarks, and individual shared libraries.
 
-### <a id="install_header_only_cpp"></a>Header-only C++
+### Header-only C++
 
 `build.sh` uses [rapids-cmake](https://github.com/rapidsai/rapids-cmake), which will automatically download any dependencies which are not already installed. It's important to note that while all the headers will be installed and available, some parts of the RAFT API depend on libraries like `FAISS`, which will need to be explicitly enabled in `build.sh`.
 
@@ -55,7 +38,7 @@ The following example will download the needed dependencies and install the RAFT
 ./build.sh libraft --install
 ```
 
-### <a id="shared_cpp_libs"></a>C++ Shared Libraries (optional)
+### C++ Shared Libraries (optional)
 
 For larger projects which make heavy use of the pairwise distances or nearest neighbors APIs, shared libraries can be built to speed up compile times. These shared libraries can also significantly improve re-compile times both while developing RAFT and developing against the APIs. Build all of the available shared libraries by passing `--compile-libs` flag to `build.sh`:
 ```bash
@@ -69,7 +52,7 @@ Individual shared libraries have their own flags and multiple can be used (thoug
 
 Add the `--install` flag to the above example to also install the shared libraries into `$INSTALL_PREFIX/lib`.
 
-### <a id="ccache"></a>`ccache` and `sccache`
+### ccache and sccache
 
 `ccache` and `sccache` can be used to better cache parts of the build when rebuilding frequently, such as when working on a new feature. You can also use `ccache` or `sccache` with `build.sh`:
 
@@ -77,7 +60,7 @@ Add the `--install` flag to the above example to also install the shared librari
 ./build.sh libraft --cache-tool=ccache
 ```
 
-### <a id="gtests"></a>Tests
+### Tests
 
 Compile the tests using the `tests` target in `build.sh`.
 
@@ -101,23 +84,23 @@ For example, to run the distance tests:
 It can take sometime to compile all of the tests. You can build individual tests by providing a semicolon-separated list to the `--limit-tests` option in `build.sh`:
 
 ```bash
-./build.sh libraft tests --limit-tests=SPATIAL_TEST;DISTANCE_TEST;MATRIX_TEST
+./build.sh libraft tests --limit-tests=NEIGHBORS_TEST;DISTANCE_TEST;MATRIX_TEST
 ```
 
-### <a id="gbench"></a>Benchmarks
+### Benchmarks
 
 The benchmarks are broken apart by algorithm category, so you will find several binaries in `cpp/build/` named `*_BENCH`.
 ```bash
 ./build.sh libraft bench
 ```
 
-It can take sometime to compile all of the tests. You can build individual tests by providing a semicolon-separated list to the `--limit-tests` option in `build.sh`:
+It can take some time to compile all of the benchmarks. You can build individual benchmarks by providing a semicolon-separated list to the `--limit-bench` option in `build.sh`:
 
 ```bash
-./build.sh libraft bench --limit-bench=SPATIAL_BENCH;DISTANCE_BENCH;LINALG_BENCH
+./build.sh libraft bench --limit-bench=NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH
 ```
 
-### <a id="cpp_using_cmake"></a>C++ Using Cmake
+### C++ Using Cmake
 
 Use `CMAKE_INSTALL_PREFIX` to install RAFT into a specific location. The snippet below will install it into the current conda environment:
 ```bash
@@ -139,7 +122,6 @@ RAFT's cmake has the following configurable flags available:.
 | RAFT_COMPILE_NN_LIBRARY | ON, OFF | OFF | Compiles the `libraft-nn` shared library |
 | RAFT_COMPILE_DIST_LIBRARY | ON, OFF | OFF | Compiles the `libraft-distance` shared library |
 | RAFT_ENABLE_NN_DEPENDENCIES | ON, OFF | OFF | Searches for dependencies of nearest neighbors API, such as FAISS, and compiles them if not found. Needed for `raft::spatial::knn` |
-| RAFT_ENABLE_thrust_DEPENDENCY | ON, OFF | ON | Enables the Thrust dependency. This can be disabled when using many simple utilities or to override with a different Thrust version. |
 | RAFT_USE_FAISS_STATIC | ON, OFF | OFF | Statically link FAISS into `libraft-nn` |
 | RAFT_STATIC_LINK_LIBRARIES | ON, OFF | ON | Build static link libraries instead of shared libraries |
 | DETECT_CONDA_ENV | ON, OFF | ON | Enable detection of conda environment for dependencies |
@@ -150,7 +132,7 @@ RAFT's cmake has the following configurable flags available:.
 
 Currently, shared libraries are provided for the `libraft-nn` and `libraft-distance` components. The `libraft-nn` component depends upon [FAISS](https://github.com/facebookresearch/faiss) and the `RAFT_ENABLE_NN_DEPENDENCIES` option will build it from source if it is not already installed.
 
-### <a id="python"></a>Python
+### Python
 
 Conda environment scripts are provided for installing the necessary dependencies for building and using the Python APIs. It is preferred to use `mamba`, as it provides significant speedup over `conda`. In addition you will have to manually install `nvcc` as it will not be installed as part of the conda environment. The following example will install create and install dependencies for a CUDA 11.5 conda environment:
 
@@ -189,9 +171,9 @@ cd python/pylibraft
 py.test -s -v
 ```
 
-### <a id="docs"></a>Documentation
+### Documentation
 
-The documentation requires that the C++ headers and python packages have been built and installed. 
+The documentation requires that the C++ headers and python packages have been built and installed.
 
 The following will build the docs along with the C++ and Python packages:
 
@@ -201,11 +183,11 @@ The following will build the docs along with the C++ and Python packages:
 
 
 
-## <a id="use_raft"></a>Using RAFT in downstream projects
+## Using RAFT in downstream projects
 
 There are two different strategies for including RAFT in downstream projects, depending on whether or not the required dependencies are already installed and available on the `lib` and `include` paths.
 
-### <a id="cxx_integration"></a>C++ header-only integration using cmake
+### C++ header-only integration using cmake
 
 When the needed [build dependencies](#required_depenencies) are already satisfied, RAFT can be trivially integrated into downstream projects by cloning the repository and adding `cpp/include` from RAFT to the include path:
 ```cmake
@@ -222,7 +204,7 @@ set(RAFT_INCLUDE_DIR ${RAFT_GIT_DIR}/raft/cpp/include CACHE STRING "RAFT include
 
 If RAFT has already been installed, such as by using the `build.sh` script, use `find_package(raft)` and the `raft::raft` target if using RAFT to interact only with the public APIs of consuming projects.
 
-### <a id="use_shared_libs"></a>Using pre-compiled shared libraries
+### Using pre-compiled shared libraries
 
 Use `find_package(raft COMPONENTS nn distance)` to enable the shared libraries and transitively pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available for configuring linking paths in addition to `raft::raft`. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package).
 
@@ -234,7 +216,7 @@ The following example tells the compiler to ignore the pre-compiled templates fo
 #include <raft/distance/specializations.cuh>
 ```
 
-### <a id="build_cxx_source"></a>Building RAFT C++ from source in cmake
+### Building RAFT C++ from source in cmake
 
 RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library so it can be more easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [CMake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake).
 
@@ -324,6 +306,6 @@ find_and_configure_raft(VERSION    ${RAFT_VERSION}.00
 
 If using the nearest neighbors APIs without the shared libraries, set `ENABLE_NN_DEPENDENCIES=ON` and keep `USE_NN_LIBRARY=OFF`
 
-### <a id="py_integration"></a>Python/Cython Integration
+### Python/Cython Integration
 
 Once installed, RAFT's Python library can be added to downstream conda recipes, imported and used directly.
diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst
index db139031a2..cf3829422d 100644
--- a/docs/source/cpp_api.rst
+++ b/docs/source/cpp_api.rst
@@ -1,6 +1,7 @@
-~~~~~~~~~~~~~~~~~~~~~~
-RAFT C++ API Reference
-~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~
+C++ API Reference
+~~~~~~~~~~~~~~~~~
+
 
 
 .. _api:
@@ -9,11 +10,13 @@ RAFT C++ API Reference
    :maxdepth: 4
 
    cpp_api/core.rst
-   cpp_api/clustering.rst
+   cpp_api/cluster.rst
+   cpp_api/distance.rst
    cpp_api/linalg.rst
    cpp_api/matrix.rst
-   cpp_api/optimization.rst
+   cpp_api/mdspan.rst
+   cpp_api/neighbors.rst
+   cpp_api/solver.rst
    cpp_api/random.rst
-   cpp_api/spatial.rst
    cpp_api/sparse.rst
    cpp_api/stats.rst
\ No newline at end of file
diff --git a/docs/source/cpp_api/cluster.rst b/docs/source/cpp_api/cluster.rst
new file mode 100644
index 0000000000..90c430ace9
--- /dev/null
+++ b/docs/source/cpp_api/cluster.rst
@@ -0,0 +1,28 @@
+Cluster
+=======
+
+This page provides C++ class references for the publicly-exposed elements of the `raft/cluster` headers. RAFT provides
+fundamental clustering algorithms which are, themselves, considered reusable building blocks for other algorithms.
+
+K-Means
+-------
+
+.. doxygennamespace:: raft::cluster::kmeans
+    :project: RAFT
+    :members:
+
+
+Hierarchical Clustering
+-----------------------
+
+.. doxygennamespace:: raft::cluster::hierarchy
+    :project: RAFT
+    :members:
+
+
+Spectral Clustering
+-------------------
+
+.. doxygennamespace:: raft::spectral
+    :project: RAFT
+    :members:
diff --git a/docs/source/cpp_api/clustering.rst b/docs/source/cpp_api/clustering.rst
deleted file mode 100644
index 90ca786cc1..0000000000
--- a/docs/source/cpp_api/clustering.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Clustering
-==========
-
-This page provides C++ class references for the publicly-exposed elements of the clustering package.
-
-.. doxygennamespace:: raft::cluster
-    :project: RAFT
-    :members:
-
-.. doxygennamespace:: raft::spectral
-    :project: RAFT
-    :members:
\ No newline at end of file
diff --git a/docs/source/cpp_api/core.rst b/docs/source/cpp_api/core.rst
index ef6270556e..9e4ef412f7 100644
--- a/docs/source/cpp_api/core.rst
+++ b/docs/source/cpp_api/core.rst
@@ -1,9 +1,10 @@
 Core
 ====
 
-This page provides C++ class references for the publicly-exposed elements of the core package.
-
-
+This page provides C++ class references for the publicly-exposed elements of the `raft/core` package. The `raft/core` headers
+require minimal dependencies, can be compiled without `nvcc`, and thus are safe to expose on your own public APIs. Aside from
+the headers in the `raft/core` include directory, any headers in the codebase with the suffix `_types.hpp` are also safe to
+expose in public APIs.
 
 handle_t
 ########
@@ -13,26 +14,25 @@ handle_t
     :members:
 
 
-interruptible
+Interruptible
 #############
 
 .. doxygenclass:: raft::interruptible
     :project: RAFT
     :members:
 
+NVTX
+####
 
-mdarray
-#######
-
-.. doxygenclass:: raft::mdarray
+.. doxygennamespace:: raft::common::nvtx
     :project: RAFT
     :members:
 
 
-span
-####
+Key-Value Pair
+##############
 
-.. doxygenclass:: raft::span
+.. doxygenstruct:: raft::KeyValuePair
     :project: RAFT
     :members:
 
diff --git a/docs/source/cpp_api/distance.rst b/docs/source/cpp_api/distance.rst
new file mode 100644
index 0000000000..2596361f6a
--- /dev/null
+++ b/docs/source/cpp_api/distance.rst
@@ -0,0 +1,11 @@
+Distance
+========
+
+This page provides C++ class references for the publicly-exposed elements of the `raft/distance` package. RAFT's
+distances have been highly optimized and support a wide assortment of different distance measures.
+
+Distance
+########
+
+.. doxygennamespace:: raft::distance
+    :project: RAFT
diff --git a/docs/source/cpp_api/linalg.rst b/docs/source/cpp_api/linalg.rst
index f9986fd2ce..5664e5b3dc 100644
--- a/docs/source/cpp_api/linalg.rst
+++ b/docs/source/cpp_api/linalg.rst
@@ -1,7 +1,10 @@
 Linear Algebra
 ==============
 
-This page provides C++ class references for the publicly-exposed elements of the (dense) linear algebra package.
+This page provides C++ class references for the publicly-exposed elements of the `raft/linalg` (dense) linear algebra headers.
+In addition to providing highly optimized arithmetic and matrix/vector operations, RAFT provides a consistent user experience
+by providing common BLAS routines, standard linear system solvers, factorization and eigenvalue solvers. Some of these routines
+hide the complexities of lower-level C-based libraries provided in the CUDA toolkit.
 
 .. doxygennamespace:: raft::linalg
     :project: RAFT
diff --git a/docs/source/cpp_api/matrix.rst b/docs/source/cpp_api/matrix.rst
index 65534aa6ee..945658eb7b 100644
--- a/docs/source/cpp_api/matrix.rst
+++ b/docs/source/cpp_api/matrix.rst
@@ -1,7 +1,8 @@
 Matrix
 ======
 
-This page provides C++ class references for the publicly-exposed elements of the matrix package.
+This page provides C++ class references for the publicly-exposed elements of the `raft/matrix` headers. The `raft/matrix`
+headers cover many operations on matrices that are otherwise not covered by `raft/linalg`.
 
 .. doxygennamespace:: raft::matrix
     :project: RAFT
diff --git a/docs/source/cpp_api/mdspan.rst b/docs/source/cpp_api/mdspan.rst
new file mode 100644
index 0000000000..a283da967b
--- /dev/null
+++ b/docs/source/cpp_api/mdspan.rst
@@ -0,0 +1,344 @@
+Multi-dimensional Span / Array
+==============================
+
+This page provides C++ class references for RAFT's 1d span and multi-dimensional owning (mdarray) and non-owning (mdspan) APIs. These headers can be found in the `raft/core` directory.
+
+Representation
+##############
+
+.. doxygenstruct:: raft::host_device_accessor
+    :project: RAFT
+    :members:
+
+.. doxygentypedef:: raft::host_accessor
+    :project: RAFT
+
+.. doxygentypedef:: raft::device_accessor
+    :project: RAFT
+
+.. doxygentypedef:: raft::managed_accessor
+    :project: RAFT
+
+.. doxygentypedef:: raft::row_major
+    :project: RAFT
+
+.. doxygentypedef:: raft::col_major
+    :project: RAFT
+
+.. doxygentypedef:: raft::matrix_extent
+    :project: RAFT
+
+.. doxygentypedef:: raft::vector_extent
+    :project: RAFT
+
+.. doxygentypedef:: raft::scalar_extent
+    :project: RAFT
+
+.. doxygentypedef:: raft::extent_3d
+    :project: RAFT
+
+.. doxygentypedef:: raft::extent_4d
+    :project: RAFT
+
+.. doxygentypedef:: raft::extent_5d
+    :project: RAFT
+
+.. doxygentypedef:: raft::dynamic_extent
+    :project: RAFT
+
+.. doxygentypedef:: raft::extents
+    :project: RAFT
+
+.. doxygenfunction:: raft::flatten
+    :project: RAFT
+
+
+.. doxygenfunction:: raft::reshape
+    :project: RAFT
+
+
+mdarray
+#######
+
+.. doxygenclass:: raft::mdarray
+    :project: RAFT
+    :members:
+
+.. doxygenclass:: raft::array_interface
+    :project: RAFT
+    :members:
+
+.. doxygenstruct:: raft::is_array_interface
+    :project: RAFT
+    :members:
+
+.. doxygentypedef:: raft::is_array_interface_t
+    :project: RAFT
+
+Device Vocabulary
+-----------------
+
+.. doxygentypedef:: raft::device_mdarray
+    :project: RAFT
+
+
+.. doxygentypedef:: raft::device_matrix
+    :project: RAFT
+
+.. doxygentypedef:: raft::device_vector
+    :project: RAFT
+
+.. doxygentypedef:: raft::device_scalar
+    :project: RAFT
+
+
+Device Factories
+----------------
+
+.. doxygenfunction:: raft::make_device_matrix
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_device_vector
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_device_scalar
+    :project: RAFT
+
+
+Host Vocabulary
+---------------
+
+.. doxygentypedef:: raft::host_matrix
+    :project: RAFT
+
+.. doxygentypedef:: raft::host_vector
+    :project: RAFT
+
+.. doxygentypedef:: raft::host_scalar
+    :project: RAFT
+
+
+Host Factories
+--------------
+
+.. doxygenfunction:: raft::make_host_matrix
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_host_vector
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_host_scalar
+    :project: RAFT
+
+mdspan
+######
+
+.. doxygentypedef:: raft::mdspan
+    :project: RAFT
+
+.. doxygenstruct:: raft::is_mdspan
+    :project: RAFT
+    :members:
+
+.. doxygentypedef:: raft::is_mdspan_t
+    :project: RAFT
+
+.. doxygenstruct:: raft::is_input_mdspan
+    :project: RAFT
+    :members:
+
+.. doxygentypedef:: raft::is_input_mdspan_t
+    :project: RAFT
+
+.. doxygenstruct:: raft::is_output_mdspan
+    :project: RAFT
+    :members:
+
+.. doxygentypedef:: raft::is_output_mdspan_t
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_input_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_output_mdspan
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_mdspan
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_extents
+    :project: RAFT
+
+.. doxygenfunction:: raft::unravel_index
+    :project: RAFT
+
+
+Device Vocabulary
+-----------------
+
+.. doxygentypedef:: raft::device_mdspan
+   :project: RAFT
+
+.. doxygenstruct:: raft::is_device_mdspan
+   :project: RAFT
+
+.. doxygenstruct:: raft::is_device_mdspan_t
+   :project: RAFT
+
+.. doxygenstruct:: raft::is_input_device_mdspan_t
+   :project: RAFT
+
+.. doxygenstruct:: raft::is_output_device_mdspan_t
+   :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_device_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_input_device_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_output_device_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::device_matrix_view
+   :project: RAFT
+
+.. doxygentypedef:: raft::device_vector_view
+   :project: RAFT
+
+.. doxygentypedef:: raft::device_scalar_view
+   :project: RAFT
+
+
+Device Factories
+----------------
+
+.. doxygenfunction:: raft::make_device_mdspan
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_device_matrix_view
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_device_vector_view
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_device_scalar_view
+   :project: RAFT
+
+
+Managed Vocabulary
+------------------
+
+.. doxygentypedef:: raft::managed_mdspan
+  :project: RAFT
+
+.. doxygenstruct:: raft::is_managed_mdspan
+   :project: RAFT
+
+.. doxygenstruct:: raft::is_managed_mdspan_t
+   :project: RAFT
+
+.. doxygenstruct:: raft::is_input_managed_mdspan_t
+   :project: RAFT
+
+.. doxygenstruct:: raft::is_output_managed_mdspan_t
+   :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_managed_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_input_managed_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_output_managed_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::managed_matrix_view
+   :project: RAFT
+
+.. doxygentypedef:: raft::managed_vector_view
+   :project: RAFT
+
+.. doxygentypedef:: raft::managed_scalar_view
+   :project: RAFT
+
+
+Managed Factories
+-----------------
+
+.. doxygenfunction:: raft::make_managed_mdspan
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_managed_matrix_view
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_managed_vector_view
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_managed_scalar_view
+   :project: RAFT
+
+
+Host Vocabulary
+---------------
+
+.. doxygentypedef:: raft::host_mdspan
+   :project: RAFT
+
+.. doxygenstruct:: raft::is_host_mdspan
+   :project: RAFT
+
+.. doxygenstruct:: raft::is_host_mdspan_t
+   :project: RAFT
+
+.. doxygenstruct:: raft::is_input_host_mdspan_t
+   :project: RAFT
+
+.. doxygenstruct:: raft::is_output_host_mdspan_t
+   :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_host_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_input_host_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::enable_if_output_host_mdspan
+    :project: RAFT
+
+.. doxygentypedef:: raft::host_matrix_view
+   :project: RAFT
+
+.. doxygentypedef:: raft::host_vector_view
+   :project: RAFT
+
+.. doxygentypedef:: raft::host_scalar_view
+   :project: RAFT
+
+Host Factories
+--------------
+
+.. doxygenfunction:: raft::make_host_matrix_view
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_host_vector_view
+    :project: RAFT
+
+.. doxygenfunction:: raft::make_host_scalar_view
+    :project: RAFT
+
+span
+####
+
+.. doxygentypedef:: raft::device_span
+   :project: RAFT
+
+.. doxygentypedef:: raft::host_span
+   :project: RAFT
+
+.. doxygenclass:: raft::span
+    :project: RAFT
+    :members:
diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst
new file mode 100644
index 0000000000..962bbd1efe
--- /dev/null
+++ b/docs/source/cpp_api/neighbors.rst
@@ -0,0 +1,43 @@
+Neighbors
+=========
+
+This page provides C++ class references for the publicly-exposed elements of the neighbors package.
+
+
+Brute-force
+-----------
+
+.. doxygennamespace:: raft::neighbors::brute_force
+    :project: RAFT
+
+
+IVF-Flat
+--------
+
+.. doxygennamespace:: raft::neighbors::ivf_flat
+    :project: RAFT
+    :members:
+
+
+IVF-PQ
+--------
+
+.. doxygennamespace:: raft::neighbors::ivf_pq
+    :project: RAFT
+    :members:
+
+
+Epsilon Neighborhood
+--------------------
+
+.. doxygennamespace:: raft::neighbors::epsilon_neighborhood
+    :project: RAFT
+    :members:
+
+
+Random Ball Cover
+-----------------
+
+.. doxygennamespace:: raft::neighbors::ball_cover
+    :project: RAFT
+    :members:
diff --git a/docs/source/cpp_api/optimization.rst b/docs/source/cpp_api/solver.rst
similarity index 54%
rename from docs/source/cpp_api/optimization.rst
rename to docs/source/cpp_api/solver.rst
index 75cec2494e..f7ca244dc8 100644
--- a/docs/source/cpp_api/optimization.rst
+++ b/docs/source/cpp_api/solver.rst
@@ -1,19 +1,18 @@
-Optimization
-============
+Solvers
+=======
 
-This page provides C++ class references for the publicly-exposed elements of the optimization package.
+This page provides C++ class references for the publicly-exposed elements of the iterative and combinatorial solvers package.
 
 
 Linear Assignment Problem
 #########################
 
-.. doxygenclass:: raft::lap::LinearAssignmentProblem
+.. doxygenclass:: raft::solver::LinearAssignmentProblem
     :project: RAFT
     :members:
 
 Minimum Spanning Tree
 #####################
 
-.. doxygennamespace:: raft::mst
+.. doxygenfunction:: raft::sparse::solver::mst
     :project: RAFT
-    :members:
diff --git a/docs/source/cpp_api/sparse.rst b/docs/source/cpp_api/sparse.rst
index c0ea61c6f7..a7c32cc65d 100644
--- a/docs/source/cpp_api/sparse.rst
+++ b/docs/source/cpp_api/sparse.rst
@@ -4,7 +4,6 @@ Sparse
 This page provides C++ class references for the publicly-exposed elements of the sparse package.
 
 
-
 Conversion
 ##########
 
@@ -26,20 +25,16 @@ Linear Algebra
     :project: RAFT
     :members:
 
-Misc Operations
-###############
+Matrix Operations
+#################
 
 .. doxygennamespace:: raft::sparse::op
     :project: RAFT
     :members:
 
-Selection
-#########
-
-.. doxygennamespace:: raft::sparse::selection
-    :project: RAFT
-    :members:
+Nearest Neighbors
+#################
 
-.. doxygennamespace:: raft::linkage
+.. doxygennamespace:: raft::sparse::neighbors
     :project: RAFT
     :members:
diff --git a/docs/source/cpp_api/spatial.rst b/docs/source/cpp_api/spatial.rst
deleted file mode 100644
index 9bda00dab7..0000000000
--- a/docs/source/cpp_api/spatial.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-Spatial
-=======
-
-This page provides C++ class references for the publicly-exposed elements of the spatial package.
-
-Distance
-########
-
-.. doxygennamespace:: raft::distance
-    :project: RAFT
-
-
-Nearest Neighbors
-#################
-
-.. doxygenfunction:: raft::spatial::knn::brute_force_knn
-    :project: RAFT
-
-.. doxygenfunction:: raft::spatial::knn::select_k
-    :project: RAFT
-
-.. doxygenfunction:: raft::spatial::knn::knn_merge_parts
-    :project: RAFT
-
-
-IVF-Flat
---------
-
-.. doxygennamespace:: raft::spatial::knn::ivf_flat
-    :project: RAFT
-    :members:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 0d7ab295f4..c46f08aac6 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,15 +1,49 @@
 Welcome to RAFT's documentation!
 =================================
 
-RAFT contains fundamental widely-used algorithms and primitives for data science and machine learning.
+RAFT contains fundamental widely-used algorithms and primitives for scientific computing, data science and machine learning. The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics.
+
+By taking a primitives-based approach to algorithm development, RAFT
+
+- accelerates algorithm construction time
+- reduces the maintenance burden by maximizing reuse across projects, and
+- centralizes core reusable computations, allowing future optimizations to benefit all algorithms that use them.
+
+
+While not exhaustive, the following general categories help summarize the accelerated building blocks that RAFT contains:
+
+.. list-table::
+   :widths: 25 50
+   :header-rows: 1
+
+   * - Category
+     - Examples
+   * - Data Formats
+     - sparse & dense, conversions, data generation
+   * - Dense Operations
+     - linear algebra, matrix and vector operations, slicing, norms, factorization, least squares, svd & eigenvalue problems
+   * - Sparse Operations
+     - linear algebra, arithmetic, eigenvalue problems, slicing, symmetrization, components & labeling
+   * - Spatial
+     - pairwise distances, nearest neighbors, neighborhood graph construction
+   * - Basic Clustering
+     - spectral clustering, hierarchical clustering, k-means
+   * - Solvers
+     - combinatorial optimization, iterative solvers
+   * - Statistics
+     - sampling, moments and summary statistics, metrics
+   * - Tools & Utilities
+     - common utilities for developing CUDA applications, multi-node multi-gpu infrastructure
 
 .. toctree::
    :maxdepth: 2
    :caption: Contents:
 
+   quick_start.md
+   build.md
    cpp_api.rst
-   raft_dask_api.rst
    pylibraft_api.rst
+   raft_dask_api.rst
 
 
 Indices and tables
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
new file mode 100644
index 0000000000..e73f9b8a7a
--- /dev/null
+++ b/docs/source/quick_start.md
@@ -0,0 +1,128 @@
+# Quick Start
+
+This guide is meant to provide a quick-start tutorial for interacting with RAFT's C++ APIs.
+
+## RAPIDS Memory Manager (RMM)
+
+RAFT relies heavily on the [RMM](https://github.com/rapidsai/rmm) library which eases the burden of configuring different allocation strategies globally across the libraries that use it.
+
+## Multi-dimensional Spans and Arrays
+
+The APIs in RAFT currently accept raw pointers to device memory and we are in the process of simplifying the APIs with the [mdspan](https://arxiv.org/abs/2010.06474) multi-dimensional array view for representing data in higher dimensions similar to the `ndarray` in the Numpy Python library. RAFT also contains the corresponding owning `mdarray` structure, which simplifies the allocation and management of multi-dimensional data in both host and device (GPU) memory.
+
+The `mdarray` forms a convenience layer over RMM and can be constructed in RAFT using a number of different helper functions:
+
+```c++
+#include <raft/core/device_mdarray.hpp>
+
+int n_rows = 10;
+int n_cols = 10;
+
+auto scalar = raft::make_device_scalar<float>(handle, 1.0);
+auto vector = raft::make_device_vector<float>(handle, n_cols);
+auto matrix = raft::make_device_matrix<float>(handle, n_rows, n_cols);
+```
+
+The `mdspan` is a lightweight non-owning view that can wrap around any pointer, maintaining shape, layout, and indexing information for accessing elements. 
+
+
+We can construct `mdspan` instances directly from the above `mdarray` instances:
+
+```c++
+// Scalar mdspan on device
+auto scalar_view = scalar.view();
+
+// Vector mdspan on device
+auto vector_view = vector.view();
+
+// Matrix mdspan on device
+auto matrix_view = matrix.view();
+```
+Since the `mdspan` is just a lightweight wrapper, we can also construct it from the underlying data handles in the `mdarray` instances above. We use the extent to get information about the `mdarray` or `mdspan`'s shape.
+
+```c++
+#include <raft/core/device_mdspan.hpp>
+
+auto scalar_view = raft::make_device_scalar_view(scalar.data_handle());
+auto vector_view = raft::make_device_vector_view(vector.data_handle(), vector.extent(0));
+auto matrix_view = raft::make_device_matrix_view(matrix.data_handle(), matrix.extent(0), matrix.extent(1));
+```
+
+Of course, RAFT's `mdspan`/`mdarray` APIs aren't just limited to the `device`. You can also create `host` variants:
+
+```c++
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/host_mdspan.hpp>
+
+int n_rows = 10;
+int n_cols = 10;
+
+auto scalar = raft::make_host_scalar<float>(handle, 1.0);
+auto vector = raft::make_host_vector<float>(handle, n_cols);
+auto matrix = raft::make_host_matrix<float>(handle, n_rows, n_cols);
+
+auto scalar_view = raft::make_host_scalar_view(scalar.data_handle());
+auto vector_view = raft::make_host_vector_view(vector.data_handle(), vector.extent(0));
+auto matrix_view = raft::make_host_matrix_view(matrix.data_handle(), matrix.extent(0), matrix.extent(1));
+```
+
+And `managed` variants:
+
+```c++
+#include <raft/core/device_mdspan.hpp>
+
+int n_rows = 10;
+int n_cols = 10;
+
+auto matrix = raft::make_managed_mdspan(managed_ptr, raft::make_matrix_extents(n_rows, n_cols));
+```
+
+
+## C++ Example
+
+Most of the primitives in RAFT accept a `raft::handle_t` object for the management of resources which are expensive to create, such as CUDA streams, stream pools, and handles to other CUDA libraries like `cublas` and `cusolver`.
+
+The example below demonstrates creating a RAFT handle and using it with `device_matrix` and `device_vector` to allocate memory, generating random clusters, and computing
+pairwise Euclidean distances:
+
+```c++
+#include <raft/core/handle.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/random/make_blobs.cuh>
+#include <raft/distance/distance.cuh>
+
+raft::handle_t handle;
+
+int n_samples = 5000;
+int n_features = 50;
+
+auto input = raft::make_device_matrix<float>(handle, n_samples, n_features);
+auto labels = raft::make_device_vector<int>(handle, n_samples);
+auto output = raft::make_device_matrix<float>(handle, n_samples, n_samples);
+
+raft::random::make_blobs(handle, input.view(), labels.view());
+
+auto metric = raft::distance::DistanceType::L2SqrtExpanded;
+raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric);
+```
+
+## Python Example
+
+The `pylibraft` package contains a Python API for RAFT algorithms and primitives. `pylibraft` integrates nicely into other libraries by being very lightweight with minimal dependencies and accepting any object that supports the `__cuda_array_interface__`, such as [CuPy's ndarray](https://docs.cupy.dev/en/stable/user_guide/interoperability.html#rmm). The package is currently limited to pairwise distances and RMAT graph generation, but we will continue adding more in future releases.
+
+The example below demonstrates computing the pairwise Euclidean distances between CuPy arrays. `pylibraft` is a low-level API that prioritizes efficiency and simplicity over being pythonic, which is shown here by pre-allocating the output memory before invoking the `pairwise_distance` function. Note that CuPy is not a required dependency for `pylibraft`.
+
+```python
+import cupy as cp
+
+from pylibraft.distance import pairwise_distance
+
+n_samples = 5000
+n_features = 50
+
+in1 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32)
+in2 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32)
+output = cp.empty((n_samples, n_samples), dtype=cp.float32)
+
+pairwise_distance(in1, in2, output, metric="euclidean")
+```
diff --git a/python/pylibraft/pylibraft/distance/CMakeLists.txt b/python/pylibraft/pylibraft/distance/CMakeLists.txt
index 707ea737b3..d074171e58 100644
--- a/python/pylibraft/pylibraft/distance/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/distance/CMakeLists.txt
@@ -13,7 +13,8 @@
 # =============================================================================
 
 # Set the list of Cython files to build
-set(cython_sources pairwise_distance.pyx)
+set(cython_sources pairwise_distance.pyx
+                   fused_l2_nn.pyx)
 set(linked_libraries raft::raft raft::distance)
 
 # Build all of the Cython targets
diff --git a/python/pylibraft/pylibraft/distance/__init__.py b/python/pylibraft/pylibraft/distance/__init__.py
index ca3e6c5a2e..a3c4e2229b 100644
--- a/python/pylibraft/pylibraft/distance/__init__.py
+++ b/python/pylibraft/pylibraft/distance/__init__.py
@@ -13,4 +13,5 @@
 # limitations under the License.
 #
 
+from .fused_l2_nn import fused_l2_nn_argmin
 from .pairwise_distance import distance as pairwise_distance
\ No newline at end of file
diff --git a/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx
new file mode 100644
index 0000000000..7abc32119b
--- /dev/null
+++ b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx
@@ -0,0 +1,161 @@
+#
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+import numpy as np
+
+from libc.stdint cimport uintptr_t
+from cython.operator cimport dereference as deref
+
+from libcpp cimport bool
+from .distance_type cimport DistanceType
+from pylibraft.common import Handle
+from pylibraft.common.handle cimport handle_t
+
+
+def is_c_cont(cai, dt):
+    # C-contiguous when strides are absent/None or axis-1 stride == itemsize.
+    strides = cai.get("strides")
+    return strides is None or strides[1] == dt.itemsize
+
+
+cdef extern from "raft_distance/fused_l2_min_arg.hpp" \
+        namespace "raft::distance::runtime":
+    # float overload; fused_l2_nn_argmin calls it for float32 inputs.
+    void fused_l2_nn_min_arg(
+        const handle_t &handle,
+        int* min,
+        const float* x,
+        const float* y,
+        int m,
+        int n,
+        int k,
+        bool sqrt)
+    # double overload; fused_l2_nn_argmin calls it for float64 inputs.
+    void fused_l2_nn_min_arg(
+        const handle_t &handle,
+        int* min,
+        const double* x,
+        const double* y,
+        int m,
+        int n,
+        int k,
+        bool sqrt)
+
+
+def fused_l2_nn_argmin(X, Y, output, sqrt=True, handle=None):
+    """
+    Compute the 1-nearest neighbors between X and Y using the L2 distance
+
+    Parameters
+    ----------
+    X : CUDA array interface compliant matrix shape (m, k)
+    Y : CUDA array interface compliant matrix shape (n, k)
+    output : Writable CUDA array interface matrix shape (m, 1), dtype int32
+    sqrt : Passed to the C++ call; presumably square-roots the distances
+    handle : Optional RAFT handle for reusing expensive CUDA resources
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        import cupy as cp
+
+        from pylibraft.common import Handle
+        from pylibraft.distance import fused_l2_nn_argmin
+
+        n_samples = 5000
+        n_clusters = 5
+        n_features = 50
+
+        in1 = cp.random.random_sample((n_samples, n_features),
+                                      dtype=cp.float32)
+        in2 = cp.random.random_sample((n_clusters, n_features),
+                                      dtype=cp.float32)
+        output = cp.empty((n_samples, 1), dtype=cp.int32)
+
+        # A single RAFT handle can optionally be reused across
+        # pylibraft functions.
+        handle = Handle()
+        ...
+        fused_l2_nn_argmin(in1, in2, output, handle=handle)
+        ...
+        # pylibraft functions are often asynchronous so the
+        # handle needs to be explicitly synchronized
+        handle.sync()
+   """
+
+    x_cai = X.__cuda_array_interface__
+    y_cai = Y.__cuda_array_interface__
+    output_cai = output.__cuda_array_interface__
+
+    m = x_cai["shape"][0]
+    n = y_cai["shape"][0]
+
+    x_k = x_cai["shape"][1]
+    y_k = y_cai["shape"][1]
+
+    if x_k != y_k:
+        raise ValueError("Inputs must have same number of columns. "
+                         "a=%s, b=%s" % (x_k, y_k))
+
+    x_ptr = <uintptr_t>x_cai["data"][0]
+    y_ptr = <uintptr_t>y_cai["data"][0]
+
+    d_ptr = <uintptr_t>output_cai["data"][0]
+
+    handle = handle if handle is not None else Handle()
+    cdef handle_t *h = <handle_t*><size_t>handle.getHandle()
+
+    x_dt = np.dtype(x_cai["typestr"])
+    y_dt = np.dtype(y_cai["typestr"])
+    d_dt = np.dtype(output_cai["typestr"])
+
+    x_c_contiguous = is_c_cont(x_cai, x_dt)
+    y_c_contiguous = is_c_cont(y_cai, y_dt)
+    # NOTE(review): layouts must match, but no layout flag reaches the C++ call.
+    if x_c_contiguous != y_c_contiguous:
+        raise ValueError("Inputs must have matching strides")
+
+    if x_dt != y_dt:
+        raise ValueError("Inputs must have the same dtypes")
+    if d_dt != np.int32:
+        raise ValueError("Output array must be int32")
+    # Select the C++ overload matching the input dtype.
+    if x_dt == np.float32:
+        fused_l2_nn_min_arg(deref(h),
+                            <int*> d_ptr,
+                            <float*> x_ptr,
+                            <float*> y_ptr,
+                            <int>m,
+                            <int>n,
+                            <int>x_k,
+                            <bool>sqrt)
+    elif x_dt == np.float64:
+        fused_l2_nn_min_arg(deref(h),
+                            <int*> d_ptr,
+                            <double*> x_ptr,
+                            <double*> y_ptr,
+                            <int>m,
+                            <int>n,
+                            <int>x_k,
+                            <bool>sqrt)
+    else:
+        raise ValueError("dtype %s not supported" % x_dt)
diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx
index 8d55402e23..0f7626e8d1 100644
--- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx
+++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx
@@ -25,6 +25,8 @@ from cython.operator cimport dereference as deref
 
 from libcpp cimport bool
 from .distance_type cimport DistanceType
+
+from pylibraft.common import Handle
 from pylibraft.common.handle cimport handle_t
 
 
@@ -88,7 +90,7 @@ SUPPORTED_DISTANCES = ["euclidean", "l1", "cityblock", "l2", "inner_product",
                        "hamming", "jensenshannon", "cosine", "sqeuclidean"]
 
 
-def distance(X, Y, dists, metric="euclidean", p=2.0):
+def distance(X, Y, dists, metric="euclidean", p=2.0, handle=None):
     """
     Compute pairwise distances between X and Y
 
@@ -106,6 +108,7 @@ def distance(X, Y, dists, metric="euclidean", p=2.0):
     dists : Writable CUDA array interface matrix shape (m, n)
     metric : string denoting the metric type (default="euclidean")
     p : metric parameter (currently used only for "minkowski")
+    handle : Optional RAFT handle for reusing expensive CUDA resources
 
     Examples
     --------
@@ -114,6 +117,7 @@ def distance(X, Y, dists, metric="euclidean", p=2.0):
 
         import cupy as cp
 
+        from pylibraft.common import Handle
         from pylibraft.distance import pairwise_distance
 
         n_samples = 5000
@@ -125,7 +129,15 @@ def distance(X, Y, dists, metric="euclidean", p=2.0):
                                       dtype=cp.float32)
         output = cp.empty((n_samples, n_samples), dtype=cp.float32)
 
-        pairwise_distance(in1, in2, output, metric="euclidean")
+        # A single RAFT handle can optionally be reused across
+        # pylibraft functions.
+        handle = Handle()
+        ...
+        pairwise_distance(in1, in2, output, metric="euclidean", handle=handle)
+        ...
+        # pylibraft functions are often asynchronous so the
+        # handle needs to be explicitly synchronized
+        handle.sync()
    """
 
     x_cai = X.__cuda_array_interface__
@@ -146,7 +158,8 @@ def distance(X, Y, dists, metric="euclidean", p=2.0):
     y_ptr = <uintptr_t>y_cai["data"][0]
     d_ptr = <uintptr_t>dists_cai["data"][0]
 
-    cdef handle_t *h = new handle_t()
+    handle = handle if handle is not None else Handle()
+    cdef handle_t *h = <handle_t*><size_t>handle.getHandle()
 
     x_dt = np.dtype(x_cai["typestr"])
     y_dt = np.dtype(y_cai["typestr"])
diff --git a/python/pylibraft/pylibraft/random/rmat_rectangular_generator.pyx b/python/pylibraft/pylibraft/random/rmat_rectangular_generator.pyx
index ea28357201..cef19295ac 100644
--- a/python/pylibraft/pylibraft/random/rmat_rectangular_generator.pyx
+++ b/python/pylibraft/pylibraft/random/rmat_rectangular_generator.pyx
@@ -22,6 +22,7 @@ import numpy as np
 
 from libc.stdint cimport uintptr_t, int64_t
 from cython.operator cimport dereference as deref
+from pylibraft.common import Handle
 from pylibraft.common.handle cimport handle_t
 from .rng_state cimport RngState
 
@@ -72,7 +73,7 @@ cdef extern from "raft_distance/random/rmat_rectangular_generator.hpp" \
                                    RngState& r)
 
 
-def rmat(out, theta, r_scale, c_scale, seed=12345):
+def rmat(out, theta, r_scale, c_scale, seed=12345, handle=None):
     """
     Generate RMAT adjacency list based on the input distribution.
 
@@ -87,6 +88,7 @@ def rmat(out, theta, r_scale, c_scale, seed=12345):
     r_scale: log2 of number of source nodes
     c_scale: log2 of number of destination nodes
     seed: random seed used for reproducibility
+    handle : Optional RAFT handle for reusing expensive CUDA resources
 
     Examples
     --------
@@ -95,6 +97,7 @@ def rmat(out, theta, r_scale, c_scale, seed=12345):
 
         import cupy as cp
 
+        from pylibraft.common import Handle
         from pylibraft.random import rmat
 
         n_edges = 5000
@@ -105,7 +108,15 @@ def rmat(out, theta, r_scale, c_scale, seed=12345):
         out = cp.empty((n_edges, 2), dtype=cp.int32)
         theta = cp.random.random_sample(theta_len, dtype=cp.float32)
 
-        rmat(out, theta, r_scale, c_scale)
+        # A single RAFT handle can optionally be reused across
+        # pylibraft functions.
+        handle = Handle()
+        ...
+        rmat(out, theta, r_scale, c_scale, handle=handle)
+        ...
+        # pylibraft functions are often asynchronous so the
+        # handle needs to be explicitly synchronized
+        handle.sync()
    """
 
     if theta is None:
@@ -123,7 +134,9 @@ def rmat(out, theta, r_scale, c_scale, seed=12345):
     theta_dt = np.dtype(theta_cai["typestr"])
 
     cdef RngState *rng = new RngState(seed)
-    cdef handle_t *h = new handle_t()
+
+    handle = handle if handle is not None else Handle()
+    cdef handle_t *h = <handle_t*><size_t>handle.getHandle()
 
     if out_dt == np.int32 and theta_dt == np.float32:
         rmat_rectangular_gen(deref(h),
diff --git a/python/pylibraft/pylibraft/test/test_distance.py b/python/pylibraft/pylibraft/test/test_distance.py
index b9b4ba9e30..7f35a25493 100644
--- a/python/pylibraft/pylibraft/test/test_distance.py
+++ b/python/pylibraft/pylibraft/test/test_distance.py
@@ -17,6 +17,7 @@
 import pytest
 import numpy as np
 
+from pylibraft.common import Handle
 from pylibraft.distance import pairwise_distance
 
 from pylibraft.testing.utils import TestDeviceBuffer
@@ -53,7 +54,13 @@ def test_distance(n_rows, n_cols, metric, order, dtype):
     input1_device = TestDeviceBuffer(input1, order)
     output_device = TestDeviceBuffer(output, order)
 
+    # Pass the handle explicitly so that the sync() below waits on the
+    # same handle that pairwise_distance() was given.
+    handle = Handle()
-    pairwise_distance(input1_device, input1_device, output_device, metric)
+    pairwise_distance(input1_device, input1_device, output_device, metric,
+                      handle=handle)
+    handle.sync()
+
     actual = output_device.copy_to_host()
 
     actual[actual <= 1e-5] = 0.0
diff --git a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
new file mode 100644
index 0000000000..1ce1ee2d1f
--- /dev/null
+++ b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from scipy.spatial.distance import cdist
+import pytest
+import numpy as np
+
+from pylibraft.common import Handle
+from pylibraft.distance import fused_l2_nn_argmin
+from pylibraft.testing.utils import TestDeviceBuffer
+
+
+@pytest.mark.parametrize("n_rows", [10, 100])
+@pytest.mark.parametrize("n_clusters", [5, 10])
+@pytest.mark.parametrize("n_cols", [3, 5])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype):
+    """Compare fused_l2_nn_argmin with a host-side cdist + argmin."""
+    # Random row-major host inputs, cast to the dtype under test.
+    points = np.random.random_sample((n_rows, n_cols))
+    points = np.asarray(points, order="C").astype(dtype)
+    centers = np.random.random_sample((n_clusters, n_cols))
+    centers = np.asarray(centers, order="C").astype(dtype)
+
+    # Host reference: index of the nearest `centers` row for each point.
+    expected = cdist(points, centers, metric="euclidean").argmin(axis=1)
+
+    points_device = TestDeviceBuffer(points, "C")
+    centers_device = TestDeviceBuffer(centers, "C")
+    labels_device = TestDeviceBuffer(np.zeros((n_rows), dtype="int32"), "C")
+
+    # Sync the handle before copying the result back to the host.
+    handle = Handle()
+    fused_l2_nn_argmin(points_device, centers_device, labels_device,
+                       True, handle=handle)
+    handle.sync()
+    actual = labels_device.copy_to_host()
+
+    assert np.allclose(expected, actual, rtol=1e-4)
diff --git a/python/pylibraft/pylibraft/test/test_random.py b/python/pylibraft/pylibraft/test/test_random.py
index 8a04f707de..e0b7140f1c 100644
--- a/python/pylibraft/pylibraft/test/test_random.py
+++ b/python/pylibraft/pylibraft/test/test_random.py
@@ -16,6 +16,7 @@
 import pytest
 import numpy as np
 
+from pylibraft.common import Handle
 from pylibraft.random import rmat
 
 from pylibraft.testing.utils import TestDeviceBuffer
@@ -46,14 +47,18 @@ def test_rmat(n_edges, r_scale, c_scale, dtype):
     theta, theta_device = generate_theta(r_scale, c_scale)
     out_buff = np.empty((n_edges, 2), dtype=dtype)
     output_device = TestDeviceBuffer(out_buff, "C")
-    rmat(output_device, theta_device, r_scale, c_scale, 12345)
+
+    handle = Handle()
+    rmat(output_device, theta_device, r_scale, c_scale, 12345, handle=handle)
+    handle.sync()
     output = output_device.copy_to_host()
     # a more rigorous tests have been done at the c++ level
     assert np.all(output[:, 0] >= 0)
     assert np.all(output[:, 0] < 2**r_scale)
     assert np.all(output[:, 1] >= 0)
     assert np.all(output[:, 1] < 2**c_scale)
-    rmat(output_device, theta_device, r_scale, c_scale, 12345)
+    rmat(output_device, theta_device, r_scale, c_scale, 12345, handle=handle)
+    handle.sync()
     output1 = output_device.copy_to_host()
     assert np.all(np.equal(output, output1))
 
diff --git a/python/pylibraft/pylibraft/test/test_interruptible.py b/python/pylibraft/pylibraft/test/test_z_interruptible.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_interruptible.py
rename to python/pylibraft/pylibraft/test/test_z_interruptible.py
diff --git a/python/pylibraft/pylibraft/testing/utils.py b/python/pylibraft/pylibraft/testing/utils.py
index 53115e991c..979fbb5672 100644
--- a/python/pylibraft/pylibraft/testing/utils.py
+++ b/python/pylibraft/pylibraft/testing/utils.py
@@ -21,6 +21,7 @@
 class TestDeviceBuffer:
 
     def __init__(self, ndarray, order):
+
         self.ndarray_ = ndarray
         self.device_buffer_ = \
             rmm.DeviceBuffer.to_device(ndarray.ravel(order=order).tobytes())