diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml index 2a90a9034a..236696d948 100644 --- a/.github/ops-bot.yaml +++ b/.github/ops-bot.yaml @@ -5,4 +5,4 @@ auto_merger: true branch_checker: true label_checker: true release_drafter: true -copy_prs: false +copy_prs: true diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml new file mode 100644 index 0000000000..0a681b864b --- /dev/null +++ b/.github/workflows/wheels.yml @@ -0,0 +1,72 @@ +name: RAFT wheels + +on: + workflow_call: + inputs: + versioneer-override: + type: string + default: '' + build-tag: + type: string + default: '' + branch: + required: true + type: string + date: + required: true + type: string + sha: + required: true + type: string + build-type: + type: string + default: nightly + +concurrency: + group: "raft-${{ github.workflow }}-${{ github.ref }}" + cancel-in-progress: true + +jobs: + pylibraft-wheel: + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux.yml@main + with: + repo: rapidsai/raft + + build-type: ${{ inputs.build-type }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + + package-dir: python/pylibraft + package-name: pylibraft + + python-package-versioneer-override: ${{ inputs.versioneer-override }} + python-package-build-tag: ${{ inputs.build-tag }} + + skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + + test-extras: test + test-unittest: "python -m pytest -v ./python/pylibraft/pylibraft/test" + secrets: inherit + raft-dask-wheel: + needs: pylibraft-wheel + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux.yml@main + with: + repo: rapidsai/raft + + build-type: ${{ inputs.build-type }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + + package-dir: python/raft-dask + package-name: raft_dask + + python-package-versioneer-override: ${{ inputs.versioneer-override }} + python-package-build-tag: ${{ 
inputs.build-tag }} + + skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + + test-extras: test + test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test" + secrets: inherit diff --git a/.gitignore b/.gitignore index 22c0e8a4a0..5d148b836b 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,6 @@ _skbuild ## doxygen build check inside ci/checks/style.sh doxygen_check/ + +## cibuildwheel +/wheelhouse diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..1c244200d1 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,102 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +repos: + - repo: https://github.com/PyCQA/isort + rev: 5.10.1 + hooks: + - id: isort + # Use the config file specific to each subproject so that each + # project can specify its own first/third-party packages. + args: ["--config-root=python/", "--resolve-all-configs"] + files: python/.* + types_or: [python, cython, pyi] + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + files: python/.* + # Explicitly specify the pyproject.toml at the repo root, not per-project. 
+ args: ["--config", "pyproject.toml"] + exclude: (.*_version.py|.*versioneer.py) + - repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + args: ["--config=setup.cfg"] + files: python/.*$ + types: [file] + types_or: [python, cython] + additional_dependencies: ["flake8-force"] + - repo: https://github.com/pre-commit/mirrors-mypy + rev: 'v0.971' + hooks: + - id: mypy + additional_dependencies: [types-cachetools] + args: ["--config-file=setup.cfg", + "python/pylibraft/pylibraft", + "python/raft-dask/raft_dask"] + pass_filenames: false + exclude: .*_version.py + - repo: https://github.com/PyCQA/pydocstyle + rev: 6.1.1 + hooks: + - id: pydocstyle + args: ["--config=setup.cfg"] + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v11.1.0 + hooks: + - id: clang-format + types_or: [c, c++, cuda] + args: ["-fallback-style=none", "-style=file", "-i"] + exclude: cpp/include/raft/thirdparty/.* + - repo: local + hooks: + - id: no-deprecationwarning + name: no-deprecationwarning + description: 'Enforce that DeprecationWarning is not introduced (use FutureWarning instead)' + entry: '(category=|\s)DeprecationWarning[,)]' + language: pygrep + types_or: [python, cython] + - id: cmake-format + name: cmake-format + entry: ./cpp/scripts/run-cmake-format.sh cmake-format + language: python + types: [cmake] + exclude: .*/thirdparty/.* + # Note that pre-commit autoupdate does not update the versions + # of dependencies, so we'll have to update this manually. + additional_dependencies: + - cmakelang==0.6.13 + verbose: true + require_serial: true + - id: cmake-lint + name: cmake-lint + entry: ./cpp/scripts/run-cmake-format.sh cmake-lint + language: python + types: [cmake] + # Note that pre-commit autoupdate does not update the versions + # of dependencies, so we'll have to update this manually. 
+ additional_dependencies: + - cmakelang==0.6.13 + verbose: true + require_serial: true + exclude: .*/thirdparty/.* + - id: copyright-check + name: copyright-check + entry: python ./ci/checks/copyright.py --git-modified-only --update-current-year + language: python + pass_filenames: false + additional_dependencies: [gitpython] + - id: include-check + name: include-check + entry: python ./cpp/scripts/include_checker.py cpp/bench cpp/include cpp/test + pass_filenames: false + language: python + additional_dependencies: [gitpython] + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + +default_language_version: + python: python3 diff --git a/README.md b/README.md index ddaf8b3f8d..e48a1b6193 100755 --- a/README.md +++ b/README.md @@ -1,5 +1,18 @@ #
 RAFT: Reusable Accelerated Functions and Tools
+[![Build Status](https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/raft/job/branches/job/raft-branch-pipeline/badge/icon)](https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/raft/job/branches/job/raft-branch-pipeline/) + +## Resources + +- [RAFT Reference Documentation](https://docs.rapids.ai/api/raft/stable/): API Documentation. +- [RAFT Getting Started](./docs/source/quick_start.md): Getting started with RAFT. +- [Build and Install RAFT](./docs/source/build.md): Instructions for installing and building RAFT. +- [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate. +- [GitHub repository](https://github.com/rapidsai/raft): Download the RAFT source code. +- [Issue tracker](https://github.com/rapidsai/raft/issues): Report issues or request features. + +## Overview + RAFT contains fundamental widely-used algorithms and primitives for data science and machine learning. The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics. By taking a primitives-based approach to algorithm development, RAFT @@ -24,7 +37,7 @@ While not exhaustive, the following general categories help summarize the accele All of RAFT's C++ APIs can be accessed header-only and optional pre-compiled shared libraries can 1) speed up compile times and 2) enable the APIs to be used without CUDA-enabled compilers. In addition to the C++ library, RAFT also provides 2 Python libraries: -- `pylibraft` - lightweight low-level Python wrappers around RAFT's host-accessable APIs. +- `pylibraft` - lightweight low-level Python wrappers around RAFT's host-accessible APIs. - `raft-dask` - multi-node multi-GPU communicator infrastructure for building distributed algorithms on the GPU with Dask. 
## Getting started @@ -77,11 +90,73 @@ auto metric = raft::distance::DistanceType::L2SqrtExpanded; raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric); ``` +It's also possible to create `raft::device_mdspan` views to invoke the same API with raw pointers and shape information: + +```c++ +#include +#include +#include +#include + +raft::handle_t handle; + +int n_samples = 5000; +int n_features = 50; + +float *input; +int *labels; +float *output; + +... +// Allocate input, labels, and output pointers +... + +auto input_view = raft::make_device_matrix_view(input, n_samples, n_features); +auto labels_view = raft::make_device_vector_view(labels, n_samples); +auto output_view = raft::make_device_matrix_view(output, n_samples, n_samples); + +raft::random::make_blobs(handle, input_view, labels_view); + +auto metric = raft::distance::DistanceType::L2SqrtExpanded; +raft::distance::pairwise_distance(handle, input_view, input_view, output_view, metric); +``` + + ### Python Example The `pylibraft` package contains a Python API for RAFT algorithms and primitives. `pylibraft` integrates nicely into other libraries by being very lightweight with minimal dependencies and accepting any object that supports the `__cuda_array_interface__`, such as [CuPy's ndarray](https://docs.cupy.dev/en/stable/user_guide/interoperability.html#rmm). The number of RAFT algorithms exposed in this package is continuing to grow from release to release. -The example below demonstrates computing the pairwise Euclidean distances between CuPy arrays. `pylibraft` is a low-level API that prioritizes efficiency and simplicity over being pythonic, which is shown here by pre-allocating the output memory before invoking the `pairwise_distance` function. Note that CuPy is not a required dependency for `pylibraft`. +The example below demonstrates computing the pairwise Euclidean distances between CuPy arrays. Note that CuPy is not a required dependency for `pylibraft`. 
+ +```python +import cupy as cp + +from pylibraft.distance import pairwise_distance + +n_samples = 5000 +n_features = 50 + +in1 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) +in2 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) + +output = pairwise_distance(in1, in2, metric="euclidean") +``` + +The `output` array supports [__cuda_array_interface__](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html#cuda-array-interface-version-2) so it is interoperable with other libraries like CuPy, Numba, and PyTorch that also support it. + +Below is an example of converting the output `pylibraft.device_ndarray` to a CuPy array: +```python +cupy_array = cp.asarray(output) +``` + +And converting to a PyTorch tensor: +```python +import torch + +torch_tensor = torch.as_tensor(output, device='cuda') +``` + +`pylibraft` also supports writing to a pre-allocated output array so any `__cuda_array_interface__` supported array can be written to in-place: ```python import cupy as cp @@ -95,12 +170,13 @@ in1 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) in2 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) output = cp.empty((n_samples, n_samples), dtype=cp.float32) -pairwise_distance(in1, in2, output, metric="euclidean") +pairwise_distance(in1, in2, out=output, metric="euclidean") ``` + ## Installing -RAFT itself can be installed through conda, [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), or by building the repository from source. Please refer to the [build instructions](docs/source/build.md) for more a comprehensive guide on building RAFT and using it in downstream projects. +RAFT itself can be installed through conda, [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), pip, or by building the repository from source. 
Please refer to the [build instructions](docs/source/build.md) for a more comprehensive guide on building RAFT and using it in downstream projects. ### Conda @@ -120,6 +196,14 @@ You can also install the `libraft-*` conda packages individually using the `mamb` After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ cmake build to compile and/or link against needed dependencies in your raft target. `COMPONENTS` are optional and will depend on the packages installed. +### Pip + +pylibraft and raft-dask both have experimental packages that can be [installed through pip](https://rapids.ai/pip.html#install): +```bash +pip install pylibraft-cu11 --extra-index-url=https://pypi.ngc.nvidia.com +pip install raft-dask-cu11 --extra-index-url=https://pypi.ngc.nvidia.com +``` + ### Cmake & CPM RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library, which makes it simple to include in downstream cmake projects. RAPIDS CMake provides a convenience layer around CPM. @@ -229,7 +313,7 @@ The folder structure mirrors other RAPIDS repos, with the following folders: ## Contributing -If you are interested in contributing to the RAFT project, please read our [Contributing guidelines](CONTRIBUTING.md). Refer to the [Developer Guide](DEVELOPER_GUIDE.md) for details on the developer guidelines, workflows, and principals. +If you are interested in contributing to the RAFT project, please read our [Contributing guidelines](docs/source/contributing.md). Refer to the [Developer Guide](docs/source/developer_guide.md) for details on the developer guidelines, workflows, and principles. ## References diff --git a/build.sh b/build.sh index b48465922a..0708c1b89e 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* # script, and that this script resides in the repo dir! 
REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pylibraft raft-dask docs tests bench clean -v -g --install --compile-libs --compile-nn --compile-dist --allgpuarch --no-nvtx --show_depr_warn -h --buildfaiss --minimal-deps" +VALIDARGS="clean libraft pylibraft raft-dask docs tests bench clean -v -g -n --compile-libs --compile-nn --compile-dist --allgpuarch --no-nvtx --show_depr_warn -h --buildfaiss --minimal-deps" HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=] [--limit-tests=] [--limit-bench=] where is: clean - remove all existing build artifacts and configuration (start over) @@ -33,6 +33,7 @@ HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool= is: -v - verbose build mode -g - build for debug + -n - no install step --compile-libs - compile shared libraries for all components --compile-nn - compile shared library for nn component --compile-dist - compile shared library for distance and current random components @@ -44,7 +45,6 @@ HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=\\\" - pass arbitrary list of CMake configuration options (escape all quotes in argument) @@ -65,12 +65,14 @@ CMAKE_LOG_LEVEL="" VERBOSE_FLAG="" BUILD_ALL_GPU_ARCH=0 BUILD_TESTS=OFF +BUILD_TYPE=Release BUILD_BENCH=OFF BUILD_STATIC_FAISS=OFF COMPILE_LIBRARIES=OFF COMPILE_NN_LIBRARY=OFF COMPILE_DIST_LIBRARY=OFF ENABLE_NN_DEPENDENCIES=OFF +INSTALL_TARGET=install TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NEIGHBORS_TEST;NEIGHBORS_TEST;STATS_TEST;UTILS_TEST" BENCH_TARGETS="CLUSTER_BENCH;NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH;MATRIX_BENCH;SPARSE_BENCH;RANDOM_BENCH" @@ -82,7 +84,6 @@ CLEAN=0 UNINSTALL=0 DISABLE_DEPRECATION_WARNINGS=ON CMAKE_TARGET="" -INSTALL_TARGET="" # Set defaults for vars that may not have been defined externally # FIXME: if INSTALL_PREFIX is not set, check PREFIX, then check @@ -190,8 +191,8 @@ if (( ${NUMARGS} != 0 )); then fi # Process 
flags -if hasArg --install; then - INSTALL_TARGET="install" +if hasArg -n; then + INSTALL_TARGET="" fi if hasArg --minimal-deps; then @@ -336,6 +337,7 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has cmake -S ${REPODIR}/cpp -B ${LIBRAFT_BUILD_DIR} \ -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -DCMAKE_CUDA_ARCHITECTURES=${RAFT_CMAKE_CUDA_ARCHITECTURES} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DRAFT_COMPILE_LIBRARIES=${COMPILE_LIBRARIES} \ -DRAFT_ENABLE_NN_DEPENDENCIES=${ENABLE_NN_DEPENDENCIES} \ -DRAFT_NVTX=${NVTX} \ diff --git a/ci/checks/black_lists.sh b/ci/checks/black_lists.sh index 849b354d08..cf289c120c 100755 --- a/ci/checks/black_lists.sh +++ b/ci/checks/black_lists.sh @@ -4,7 +4,7 @@ # RAFT black listed function call Tester # ########################################## -# PR_TARGET_BRANCH is set by the CI enviroment +# PR_TARGET_BRANCH is set by the CI environment git checkout --quiet $PR_TARGET_BRANCH diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py index 6b808cc051..bfef5392f5 100644 --- a/ci/checks/copyright.py +++ b/ci/checks/copyright.py @@ -35,10 +35,9 @@ re.compile(r"CMakeLists[.]txt$"), re.compile(r"CMakeLists_standalone[.]txt$"), re.compile(r"setup[.]cfg$"), - re.compile(r"[.]flake8[.]cython$"), re.compile(r"meta[.]yaml$") ] -ExemptFiles = [] +ExemptFiles = ["cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh"] # this will break starting at year 10000, which is probably OK :) CheckSimple = re.compile( diff --git a/ci/checks/style.sh b/ci/checks/style.sh index fb5a64fdac..f8fcbe19f8 100644 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -12,69 +12,12 @@ PATH=/opt/conda/bin:$PATH . /opt/conda/etc/profile.d/conda.sh conda activate rapids -# Run flake8 and get results/return code -FLAKE=`flake8 --exclude=cpp,thirdparty,__init__.py,versioneer.py && flake8 --config=python/.flake8.cython` -RETVAL=$? 
+FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.12/cmake-format-rapids-cmake.json +export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json +mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) +wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} -# Output results if failure otherwise show pass -if [ "$FLAKE" != "" ]; then - echo -e "\n\n>>>> FAILED: flake8 style check; begin output\n\n" - echo -e "$FLAKE" - echo -e "\n\n>>>> FAILED: flake8 style check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: flake8 style check\n\n" -fi - -# Check for copyright headers in the files modified currently -COPYRIGHT=`python ci/checks/copyright.py --git-modified-only 2>&1` -CR_RETVAL=$? -if [ "$RETVAL" = "0" ]; then - RETVAL=$CR_RETVAL -fi - -# Output results if failure otherwise show pass -if [ "$CR_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: copyright check; begin output\n\n" - echo -e "$COPYRIGHT" - echo -e "\n\n>>>> FAILED: copyright check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: copyright check\n\n" -fi - -# Check for a consistent #include syntax -HASH_INCLUDE=`python cpp/scripts/include_checker.py \ - cpp/bench \ - cpp/include \ - cpp/test \ - 2>&1` -HASH_RETVAL=$? -if [ "$RETVAL" = "0" ]; then - RETVAL=$HASH_RETVAL -fi - -# Output results if failure otherwise show pass -if [ "$HASH_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: #include check; begin output\n\n" - echo -e "$HASH_INCLUDE" - echo -e "\n\n>>>> FAILED: #include check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: #include check\n\n" -fi - -# Check for a consistent code format -FORMAT=`python cpp/scripts/run-clang-format.py 2>&1` -FORMAT_RETVAL=$? 
-if [ "$RETVAL" = "0" ]; then - RETVAL=$FORMAT_RETVAL -fi - -# Output results if failure otherwise show pass -if [ "$FORMAT_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: clang format check; begin output\n\n" - echo -e "$FORMAT" - echo -e "\n\n>>>> FAILED: clang format check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: clang format check\n\n" -fi +# Run pre-commit checks +pre-commit run --hook-stage manual --all-files exit $RETVAL diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 98cb46064c..3162802cbc 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -123,5 +123,5 @@ pytest --cache-clear --junitxml="$WORKSPACE/junit-raft-dask.xml" -v -s if [ "$(arch)" = "x86_64" ]; then gpuci_logger "Building docs" gpuci_mamba_retry install "rapids-doc-env=${MINOR_VERSION}.*" - "$WORKSPACE/build.sh" docs -v + "$WORKSPACE/build.sh" docs -v -n fi diff --git a/conda/environments/raft_dev_cuda11.2.yml b/conda/environments/raft_dev_cuda11.2.yml index caff6996e8..afb2657356 100644 --- a/conda/environments/raft_dev_cuda11.2.yml +++ b/conda/environments/raft_dev_cuda11.2.yml @@ -14,7 +14,7 @@ dependencies: - clang=11.1.0 - clang-tools=11.1.0 - cython>=0.29,<0.30 -- cmake>=3.23.1 +- cmake>=3.23.1,!=3.25.0 - dask>=2022.9.2 - distributed>=2022.9.2 - scikit-build>=0.13.1 diff --git a/conda/environments/raft_dev_cuda11.4.yml b/conda/environments/raft_dev_cuda11.4.yml index f6b91e0825..54b3f48fb0 100644 --- a/conda/environments/raft_dev_cuda11.4.yml +++ b/conda/environments/raft_dev_cuda11.4.yml @@ -14,7 +14,7 @@ dependencies: - clang=11.1.0 - clang-tools=11.1.0 - cython>=0.29,<0.30 -- cmake>=3.23.1 +- cmake>=3.23.1,!=3.25.0 - dask>=2022.9.2 - distributed>=2022.9.2 - scikit-build>=0.13.1 diff --git a/conda/environments/raft_dev_cuda11.5.yml b/conda/environments/raft_dev_cuda11.5.yml index 66f6511d6f..6555e5cc83 100644 --- a/conda/environments/raft_dev_cuda11.5.yml +++ b/conda/environments/raft_dev_cuda11.5.yml @@ -14,7 +14,7 @@ dependencies: - clang=11.1.0 - clang-tools=11.1.0 
- cython>=0.29,<0.30 -- cmake>=3.23.1 +- cmake>=3.23.1,!=3.25.0 - dask>=2022.9.2 - distributed>=2022.9.2 - scikit-build>=0.13.1 diff --git a/conda/recipes/libraft/build_libraft_distance.sh b/conda/recipes/libraft/build_libraft_distance.sh index 35a669d6df..35bf354e9b 100644 --- a/conda/recipes/libraft/build_libraft_distance.sh +++ b/conda/recipes/libraft/build_libraft_distance.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash # Copyright (c) 2022, NVIDIA CORPORATION. -./build.sh libraft --install -v --allgpuarch --compile-dist --no-nvtx +./build.sh libraft -v --allgpuarch --compile-dist --no-nvtx diff --git a/conda/recipes/libraft/build_libraft_headers.sh b/conda/recipes/libraft/build_libraft_headers.sh index 02ef674787..7bd678c07a 100644 --- a/conda/recipes/libraft/build_libraft_headers.sh +++ b/conda/recipes/libraft/build_libraft_headers.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash # Copyright (c) 2022, NVIDIA CORPORATION. -./build.sh libraft --install -v --allgpuarch --no-nvtx +./build.sh libraft -v --allgpuarch --no-nvtx diff --git a/conda/recipes/libraft/build_libraft_nn.sh b/conda/recipes/libraft/build_libraft_nn.sh index caa643a356..773d6ab02e 100644 --- a/conda/recipes/libraft/build_libraft_nn.sh +++ b/conda/recipes/libraft/build_libraft_nn.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash # Copyright (c) 2022, NVIDIA CORPORATION. 
-./build.sh libraft --install -v --allgpuarch --compile-nn --no-nvtx +./build.sh libraft -v --allgpuarch --compile-nn --no-nvtx diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index c4d0c2a087..fc77dfc89b 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -11,7 +11,7 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.23.1" + - ">=3.23.1,!=3.25.0" nccl_version: - ">=2.9.9" diff --git a/conda/recipes/pylibraft/build.sh b/conda/recipes/pylibraft/build.sh index 4e64d031ec..2f02fb5a4c 100644 --- a/conda/recipes/pylibraft/build.sh +++ b/conda/recipes/pylibraft/build.sh @@ -2,4 +2,4 @@ #!/usr/bin/env bash # This assumes the script is executed from the root of the repo directory -./build.sh pylibraft --install --no-nvtx +./build.sh pylibraft --no-nvtx diff --git a/conda/recipes/pylibraft/conda_build_config.yaml b/conda/recipes/pylibraft/conda_build_config.yaml index 725c38cb6a..f16406336b 100644 --- a/conda/recipes/pylibraft/conda_build_config.yaml +++ b/conda/recipes/pylibraft/conda_build_config.yaml @@ -11,4 +11,4 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.23.1" + - ">=3.23.1,!=3.25.0" diff --git a/conda/recipes/raft-dask/build.sh b/conda/recipes/raft-dask/build.sh index 963433dd8d..ec81224e03 100644 --- a/conda/recipes/raft-dask/build.sh +++ b/conda/recipes/raft-dask/build.sh @@ -1,6 +1,5 @@ # Copyright (c) 2022, NVIDIA CORPORATION. #!/usr/bin/env bash -# Copyright (c) 2022, NVIDIA CORPORATION. 
# This assumes the script is executed from the root of the repo directory -./build.sh raft-dask --install --no-nvtx +./build.sh raft-dask --no-nvtx diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index a6ca533504..3b42dab182 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -14,4 +14,4 @@ ucx_version: - "1.13.0" cmake_version: - - ">=3.23.1" + - ">=3.23.1,!=3.25.0" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fff2148e7e..8a006f01df 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,18 +1,15 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2020-2022, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
See the License for the specific language governing permissions and limitations under +# the License. set(RAPIDS_VERSION "23.02") set(RAFT_VERSION "23.02.00") @@ -26,18 +23,17 @@ include(rapids-find) rapids_cuda_init_architectures(RAFT) -project(RAFT VERSION ${RAFT_VERSION} LANGUAGES CXX CUDA) - -# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to -# have different values for the `Threads::Threads` target. Setting this flag ensures -# `Threads::Threads` is the same value in first run and subsequent runs. -set(THREADS_PREFER_PTHREAD_FLAG ON) +project( + RAFT + VERSION ${RAFT_VERSION} + LANGUAGES CXX CUDA +) # Write the version header rapids_cmake_write_version_file(include/raft/version_config.hpp) -############################################################################## -# - build type --------------------------------------------------------------- +# ################################################################################################## +# * build type --------------------------------------------------------------- # Set a default build type if none was specified rapids_cmake_build_type(Release) @@ -45,14 +41,16 @@ rapids_cmake_build_type(Release) # this is needed for clang-tidy runs set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -############################################################################## -# - User Options ------------------------------------------------------------ +# ################################################################################################## +# * User Options ------------------------------------------------------------ option(BUILD_SHARED_LIBS "Build raft shared libraries" ON) option(BUILD_TESTS "Build raft unit-tests" ON) option(BUILD_BENCH "Build raft C++ benchmark tests" OFF) option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF) -option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF) 
+option(CUDA_ENABLE_LINEINFO + "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF +) option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and libraries" OFF) option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON) option(DISABLE_DEPRECATION_WARNINGS "Disable deprecaction warnings " ON) @@ -60,12 +58,26 @@ option(DISABLE_OPENMP "Disable OpenMP" OFF) option(RAFT_NVTX "Enable nvtx markers" OFF) option(RAFT_COMPILE_LIBRARIES "Enable building raft shared library instantiations" ${BUILD_TESTS}) -option(RAFT_COMPILE_NN_LIBRARY "Enable building raft nearest neighbors shared library instantiations" ${RAFT_COMPILE_LIBRARIES}) -option(RAFT_COMPILE_DIST_LIBRARY "Enable building raft distant shared library instantiations" ${RAFT_COMPILE_LIBRARIES}) -option(RAFT_ENABLE_NN_DEPENDENCIES "Search for raft::nn dependencies like faiss" ${RAFT_COMPILE_LIBRARIES}) +option( + RAFT_COMPILE_NN_LIBRARY "Enable building raft nearest neighbors shared library instantiations" + ${RAFT_COMPILE_LIBRARIES} +) +option(RAFT_COMPILE_DIST_LIBRARY "Enable building raft distant shared library instantiations" + ${RAFT_COMPILE_LIBRARIES} +) +option(RAFT_ENABLE_NN_DEPENDENCIES "Search for raft::nn dependencies like faiss" + ${RAFT_COMPILE_LIBRARIES} +) option(RAFT_ENABLE_thrust_DEPENDENCY "Enable Thrust dependency" ON) +if(BUILD_TESTS OR BUILD_BENCH) + # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs + # to have different values for the `Threads::Threads` target. Setting this flag ensures + # `Threads::Threads` is the same value in first run and subsequent runs. 
+ set(THREADS_PREFER_PTHREAD_FLAG ON) +endif() + if(BUILD_TESTS AND NOT RAFT_ENABLE_thrust_DEPENDENCY) message(VERBOSE "RAFT: BUILD_TESTS is enabled, overriding RAFT_ENABLE_thrust_DEPENDENCY") set(RAFT_ENABLE_thrust_DEPENDENCY ON) @@ -74,7 +86,13 @@ endif() option(RAFT_EXCLUDE_FAISS_FROM_ALL "Exclude FAISS targets from RAFT's 'all' target" ON) include(CMakeDependentOption) -cmake_dependent_option(RAFT_USE_FAISS_STATIC "Build and statically link the FAISS library for nearest neighbors search on GPU" ON RAFT_COMPILE_LIBRARIES OFF) +cmake_dependent_option( + RAFT_USE_FAISS_STATIC + "Build and statically link the FAISS library for nearest neighbors search on GPU" + ON + RAFT_COMPILE_LIBRARIES + OFF +) message(VERBOSE "RAFT: Building optional components: ${raft_FIND_COMPONENTS}") message(VERBOSE "RAFT: Build RAFT unit-tests: ${BUILD_TESTS}") @@ -85,43 +103,45 @@ message(VERBOSE "RAFT: Disable OpenMP: ${DISABLE_OPENMP}") message(VERBOSE "RAFT: Enable kernel resource usage info: ${CUDA_ENABLE_KERNELINFO}") message(VERBOSE "RAFT: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}") message(VERBOSE "RAFT: Enable nvtx markers: ${RAFT_NVTX}") -message(VERBOSE "RAFT: Statically link the CUDA toolkit runtime and libraries: ${CUDA_STATIC_RUNTIME}") +message(VERBOSE + "RAFT: Statically link the CUDA toolkit runtime and libraries: ${CUDA_STATIC_RUNTIME}" +) # Set RMM logging level -set(RMM_LOGGING_LEVEL "INFO" CACHE STRING "Choose the logging level.") -set_property(CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF") +set(RMM_LOGGING_LEVEL + "INFO" + CACHE STRING "Choose the logging level." 
+) +set_property( + CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF" +) message(VERBOSE "RAFT: RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'.") -############################################################################## -# - Conda environment detection ---------------------------------------------- +# ################################################################################################## +# * Conda environment detection ---------------------------------------------- if(DETECT_CONDA_ENV) - rapids_cmake_support_conda_env( conda_env MODIFY_PREFIX_PATH ) - if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND DEFINED ENV{CONDA_PREFIX}) - message(STATUS "RAFT: No CMAKE_INSTALL_PREFIX argument detected, setting to: $ENV{CONDA_PREFIX}") + rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) + if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND DEFINED ENV{CONDA_PREFIX}) + message( + STATUS "RAFT: No CMAKE_INSTALL_PREFIX argument detected, setting to: $ENV{CONDA_PREFIX}" + ) set(CMAKE_INSTALL_PREFIX "$ENV{CONDA_PREFIX}") endif() endif() -############################################################################## -# - compiler options --------------------------------------------------------- +# ################################################################################################## +# * compiler options --------------------------------------------------------- set(_ctk_static_suffix "") if(CUDA_STATIC_RUNTIME) - # If we're statically linking CTK cuBLAS, - # we also want to statically link BLAS - set(BLA_STATIC ON) set(_ctk_static_suffix "_static") - # Control legacy FindCUDA.cmake behavior too - # Remove this after we push it into rapids-cmake: - # https://github.com/rapidsai/rapids-cmake/pull/259 - set(CUDA_USE_STATIC_CUDA_RUNTIME ON) endif() # CUDA runtime rapids_cuda_init_runtime(USE_STATIC ${CUDA_STATIC_RUNTIME}) -if (NOT DISABLE_OPENMP) +if(NOT DISABLE_OPENMP) find_package(OpenMP) 
if(OPENMP_FOUND) message(VERBOSE "RAFT: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}") @@ -132,13 +152,15 @@ endif() # * determine GPU architectures # * enable the CMake CUDA language # * set other CUDA compilation flags -rapids_find_package(CUDAToolkit REQUIRED - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports) +rapids_find_package( + CUDAToolkit REQUIRED + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports +) include(cmake/modules/ConfigureCUDA.cmake) -############################################################################## -# - Requirements ------------------------------------------------------------- +# ################################################################################################## +# * Requirements ------------------------------------------------------------- if(RAFT_COMPILE_LIBRARIES) set(RAFT_COMPILE_DIST_LIBRARY ON) @@ -156,10 +178,13 @@ rapids_cpm_init() include(cmake/thirdparty/get_thrust.cmake) include(cmake/thirdparty/get_rmm.cmake) include(cmake/thirdparty/get_faiss.cmake) +include(cmake/thirdparty/get_cutlass.cmake) if(RAFT_ENABLE_cuco_DEPENDENCY) include(${rapids-cmake-dir}/cpm/cuco.cmake) - rapids_cpm_cuco(BUILD_EXPORT_SET raft-distance-lib-exports INSTALL_EXPORT_SET raft-distance-lib-exports) + rapids_cpm_cuco( + BUILD_EXPORT_SET raft-distance-lib-exports INSTALL_EXPORT_SET raft-distance-lib-exports + ) endif() if(BUILD_TESTS) @@ -171,69 +196,77 @@ if(BUILD_BENCH) rapids_cpm_gbench() endif() -############################################################################## -# - raft --------------------------------------------------------------------- +# ################################################################################################## +# * raft --------------------------------------------------------------------- add_library(raft INTERFACE) add_library(raft::raft ALIAS raft) -target_include_directories(raft INTERFACE - "$" - "$") - -# Keep RAFT as lightweight as possible. 
-# Only CUDA libs and rmm should -# be used in global target. -target_link_libraries(raft INTERFACE - rmm::rmm - CUDA::cublas${_ctk_static_suffix} - CUDA::curand${_ctk_static_suffix} - CUDA::cusolver${_ctk_static_suffix} - CUDA::cusparse${_ctk_static_suffix} - $<$:raft::Thrust> +target_include_directories( + raft INTERFACE "$" "$" +) + +# Keep RAFT as lightweight as possible. Only CUDA libs and rmm should be used in global target. +target_link_libraries( + raft + INTERFACE rmm::rmm + CUDA::cublas${_ctk_static_suffix} + CUDA::curand${_ctk_static_suffix} + CUDA::cusolver${_ctk_static_suffix} + CUDA::cusparse${_ctk_static_suffix} + $<$:raft::Thrust> ) target_compile_features(raft INTERFACE cxx_std_17 $) if(RAFT_COMPILE_DIST_LIBRARY OR RAFT_COMPILE_NN_LIBRARY) - file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld" - [=[ + file( + WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld" + [=[ SECTIONS { .nvFatBinSegment : { *(.nvFatBinSegment) } .nv_fatbin : { *(.nv_fatbin) } } -]=]) +]=] + ) endif() -############################################################################## -# - NVTX support in raft ----------------------------------------------------- +# ################################################################################################## +# * NVTX support in raft ----------------------------------------------------- -if (RAFT_NVTX) +if(RAFT_NVTX) # This enables NVTX within the project with no option to disable it downstream. target_link_libraries(raft INTERFACE CUDA::nvToolsExt) target_compile_definitions(raft INTERFACE NVTX_ENABLED) else() - # Allow enable NVTX downstream if not set here. - # This creates a new option at build/install time, which is set by default to OFF, - # but can be enabled in the dependent project. - get_property(nvtx_option_help_string CACHE RAFT_NVTX PROPERTY HELPSTRING) - string(CONCAT nvtx_export_string - "option(RAFT_NVTX \"" ${nvtx_option_help_string} "\" OFF)" - [=[ + # Allow enable NVTX downstream if not set here. 
This creates a new option at build/install time, + # which is set by default to OFF, but can be enabled in the dependent project. + get_property( + nvtx_option_help_string + CACHE RAFT_NVTX + PROPERTY HELPSTRING + ) + string( + CONCAT + nvtx_export_string + "option(RAFT_NVTX \"" + ${nvtx_option_help_string} + "\" OFF)" + [=[ target_link_libraries(raft::raft INTERFACE $<$:CUDA::nvToolsExt>) target_compile_definitions(raft::raft INTERFACE $<$:NVTX_ENABLED>) - ]=]) + ]=] + ) endif() -############################################################################## -# - raft_distance ------------------------------------------------------------ -# TODO: -# Currently, this package also contains the 'random' namespace (for rmat logic) -# We couldn't get this to work properly due to strange CI failures as noticed -# in the PR#778. In the long term, we should rename this package to `raft_compiled` -# in order to have a single pre-compiled raft package for those who need it. +# ################################################################################################## +# * raft_distance ------------------------------------------------------------ TODO: Currently, this +# package also contains the 'random' namespace (for rmat logic) We couldn't get this to work +# properly due to strange CI failures as noticed in the PR#778. In the long term, we should rename +# this package to `raft_compiled` in order to have a single pre-compiled raft package for those +# who need it. 
add_library(raft_distance INTERFACE) if(TARGET raft_distance AND (NOT TARGET raft::distance)) @@ -243,11 +276,14 @@ endif() set_target_properties(raft_distance PROPERTIES EXPORT_NAME distance) if(RAFT_COMPILE_DIST_LIBRARY) - add_library(raft_distance_lib + add_library( + raft_distance_lib src/distance/pairwise_distance.cu src/distance/fused_l2_min_arg.cu src/distance/update_centroids_float.cu src/distance/update_centroids_double.cu + src/distance/cluster_cost_float.cu + src/distance/cluster_cost_double.cu src/distance/specializations/detail/canberra.cu src/distance/specializations/detail/chebyshev.cu src/distance/specializations/detail/correlation.cu @@ -262,9 +298,9 @@ if(RAFT_COMPILE_DIST_LIBRARY) src/distance/specializations/detail/kernels/gram_matrix_base_float.cu src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu -# These are somehow missing a kernel definition which is causing a compile error. -# src/distance/specializations/detail/kernels/rbf_kernel_double.cu -# src/distance/specializations/detail/kernels/rbf_kernel_float.cu + # These are somehow missing a kernel definition which is causing a compile error. 
+ # src/distance/specializations/detail/kernels/rbf_kernel_double.cu + # src/distance/specializations/detail/kernels/rbf_kernel_float.cu src/distance/specializations/detail/kernels/tanh_kernel_double.cu src/distance/specializations/detail/kernels/tanh_kernel_float.cu src/distance/specializations/detail/kl_divergence_float_float_float_int.cu @@ -295,6 +331,21 @@ if(RAFT_COMPILE_DIST_LIBRARY) src/distance/specializations/fused_l2_nn_double_int64.cu src/distance/specializations/fused_l2_nn_float_int.cu src/distance/specializations/fused_l2_nn_float_int64.cu + src/nn/specializations/detail/ivfpq_build.cu + src/nn/specializations/detail/ivfpq_compute_similarity_float_fast.cu + src/nn/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu + src/nn/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu + src/nn/specializations/detail/ivfpq_compute_similarity_half_fast.cu + src/nn/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu + src/nn/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu + src/nn/specializations/detail/ivfpq_search.cu + src/nn/specializations/detail/ivfpq_search_float_uint64_t.cu src/random/specializations/rmat_rectangular_generator_int_double.cu src/random/specializations/rmat_rectangular_generator_int64_double.cu src/random/specializations/rmat_rectangular_generator_int_float.cu @@ -302,26 +353,27 @@ if(RAFT_COMPILE_DIST_LIBRARY) ) set_target_properties( raft_distance_lib - PROPERTIES OUTPUT_NAME raft_distance - BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH 
"\$ORIGIN" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON) - - target_link_libraries(raft_distance_lib - PUBLIC raft::raft - cuco::cuco - ) - target_compile_options(raft_distance_lib - PRIVATE "$<$:${RAFT_CXX_FLAGS}>" - "$<$:${RAFT_CUDA_FLAGS}>" - ) - target_compile_definitions(raft_distance_lib - INTERFACE "RAFT_DISTANCE_COMPILED") + PROPERTIES OUTPUT_NAME raft_distance + BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + + target_link_libraries( + raft_distance_lib + PUBLIC raft::raft cuco::cuco + PRIVATE nvidia::cutlass::cutlass + ) + target_compile_options( + raft_distance_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) + target_compile_definitions(raft_distance_lib INTERFACE "RAFT_DISTANCE_COMPILED") # ensure CUDA symbols aren't relocated to the middle of the debug build binaries target_link_options(raft_distance_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") @@ -332,13 +384,13 @@ if(TARGET raft_distance_lib AND (NOT TARGET raft::raft_distance_lib)) add_library(raft::raft_distance_lib ALIAS raft_distance_lib) endif() -target_link_libraries(raft_distance INTERFACE - raft::raft - $ +target_link_libraries( + raft_distance INTERFACE raft::raft $ + nvidia::cutlass::cutlass ) -############################################################################## -# - raft_nn ------------------------------------------------------------------ +# ################################################################################################## +# * raft_nn ------------------------------------------------------------------ add_library(raft_nn INTERFACE) if(TARGET raft_nn AND (NOT TARGET raft::nn)) @@ -348,126 +400,170 @@ endif() set_target_properties(raft_nn PROPERTIES 
EXPORT_NAME nn) if(RAFT_COMPILE_NN_LIBRARY) - add_library(raft_nn_lib - src/nn/specializations/ball_cover.cu - src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu - src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu - src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu - src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu - src/nn/specializations/detail/ivfpq_compute_similarity_float_fast.cu - src/nn/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu - src/nn/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu - src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu - src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu - src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu - src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu - src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu - src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu - src/nn/specializations/detail/ivfpq_compute_similarity_half_fast.cu - src/nn/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu - src/nn/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu - src/nn/specializations/detail/ivfpq_search_float_int64_t.cu - src/nn/specializations/detail/ivfpq_search_float_uint32_t.cu - src/nn/specializations/detail/ivfpq_search_float_uint64_t.cu - src/nn/specializations/fused_l2_knn_long_float_true.cu - src/nn/specializations/fused_l2_knn_long_float_false.cu - src/nn/specializations/fused_l2_knn_int_float_true.cu - src/nn/specializations/fused_l2_knn_int_float_false.cu - src/nn/specializations/knn.cu - ) + add_library( + raft_nn_lib + src/nn/specializations/ball_cover.cu + src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu + src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu + src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu + 
src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu + src/nn/specializations/detail/ivfpq_compute_similarity_float_fast.cu + src/nn/specializations/detail/ivfpq_compute_similarity_float_no_basediff.cu + src/nn/specializations/detail/ivfpq_compute_similarity_float_no_smem_lut.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_fast.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_basediff.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8s_no_smem_lut.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_fast.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_basediff.cu + src/nn/specializations/detail/ivfpq_compute_similarity_fp8u_no_smem_lut.cu + src/nn/specializations/detail/ivfpq_compute_similarity_half_fast.cu + src/nn/specializations/detail/ivfpq_compute_similarity_half_no_basediff.cu + src/nn/specializations/detail/ivfpq_compute_similarity_half_no_smem_lut.cu + src/nn/specializations/detail/ivfpq_build.cu + src/nn/specializations/detail/ivfpq_search.cu + src/nn/specializations/detail/ivfpq_search_float_int64_t.cu + src/nn/specializations/detail/ivfpq_search_float_uint32_t.cu + src/nn/specializations/detail/ivfpq_search_float_uint64_t.cu + src/nn/specializations/fused_l2_knn_long_float_true.cu + src/nn/specializations/fused_l2_knn_long_float_false.cu + src/nn/specializations/fused_l2_knn_int_float_true.cu + src/nn/specializations/fused_l2_knn_int_float_false.cu + src/nn/specializations/knn.cu + ) set_target_properties( raft_nn_lib - PROPERTIES OUTPUT_NAME raft_nn - BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON) - - target_link_libraries(raft_nn_lib - PUBLIC faiss::faiss - raft::raft) - target_compile_options(raft_nn_lib - PRIVATE "$<$:${RAFT_CXX_FLAGS}>" - "$<$:${RAFT_CUDA_FLAGS}>" - ) + PROPERTIES OUTPUT_NAME 
raft_nn + BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + + target_link_libraries( + raft_nn_lib + PUBLIC faiss::faiss raft::raft + PRIVATE nvidia::cutlass::cutlass + ) + target_compile_options( + raft_nn_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) # ensure CUDA symbols aren't relocated to the middle of the debug build binaries target_link_options(raft_nn_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") - target_compile_definitions(raft_nn_lib - INTERFACE "RAFT_NN_COMPILED") + target_compile_definitions(raft_nn_lib INTERFACE "RAFT_NN_COMPILED") endif() if(TARGET raft_nn_lib AND (NOT TARGET raft::raft_nn_lib)) add_library(raft::raft_nn_lib ALIAS raft_nn_lib) endif() -target_link_libraries(raft_nn INTERFACE - raft::raft - $) +target_link_libraries( + raft_nn INTERFACE raft::raft $ nvidia::cutlass::cutlass +) + +# ################################################################################################## +# * raft_distributed ------------------------------------------------------------------------------- +add_library(raft_distributed INTERFACE) + +if(TARGET raft_distributed AND (NOT TARGET raft::distributed)) + add_library(raft::distributed ALIAS raft_distributed) +endif() + +set_target_properties(raft_distributed PROPERTIES EXPORT_NAME distributed) + +rapids_export_package(BUILD ucx raft-distributed-exports) +rapids_export_package(INSTALL ucx raft-distributed-exports) -############################################################################## -# - install targets----------------------------------------------------------- -rapids_cmake_install_lib_dir( lib_dir ) +target_link_libraries(raft_distributed INTERFACE ucx::ucp) + +# ################################################################################################## +# * install 
targets----------------------------------------------------------- +rapids_cmake_install_lib_dir(lib_dir) include(GNUInstallDirs) include(CPack) -install(TARGETS raft - DESTINATION ${lib_dir} - COMPONENT raft - EXPORT raft-exports) +install( + TARGETS raft + DESTINATION ${lib_dir} + COMPONENT raft + EXPORT raft-exports +) -install(TARGETS raft_distance - DESTINATION ${lib_dir} - COMPONENT raft - EXPORT raft-distance-exports) +install( + TARGETS raft_distance + DESTINATION ${lib_dir} + COMPONENT raft + EXPORT raft-distance-exports +) -install(TARGETS raft_nn - DESTINATION ${lib_dir} - COMPONENT raft - EXPORT raft-nn-exports) +install( + TARGETS raft_nn + DESTINATION ${lib_dir} + COMPONENT raft + EXPORT raft-nn-exports +) if(TARGET raft_distance_lib) - install(TARGETS raft_distance_lib - DESTINATION ${lib_dir} - COMPONENT distance - EXPORT raft-distance-lib-exports) - install(DIRECTORY include/raft_distance - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} - COMPONENT distance) + install( + TARGETS raft_distance_lib + DESTINATION ${lib_dir} + COMPONENT distance + EXPORT raft-distance-lib-exports + ) + install( + DIRECTORY include/raft_distance + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + COMPONENT distance + ) endif() if(TARGET raft_nn_lib) - install(TARGETS raft_nn_lib - DESTINATION ${lib_dir} - COMPONENT nn - EXPORT raft-nn-lib-exports) + install( + TARGETS raft_nn_lib + DESTINATION ${lib_dir} + COMPONENT nn + EXPORT raft-nn-lib-exports + ) endif() +install( + TARGETS raft_distributed + DESTINATION ${lib_dir} + COMPONENT distributed + EXPORT raft-distributed-exports +) -install(DIRECTORY include/raft - COMPONENT raft - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +install( + DIRECTORY include/raft + COMPONENT raft + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) # Temporary install of raft.hpp while the file is removed -install(FILES include/raft.hpp - COMPONENT raft - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft) +install( + FILES include/raft.hpp + COMPONENT raft + DESTINATION 
${CMAKE_INSTALL_INCLUDEDIR}/raft +) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/include/raft/version_config.hpp - COMPONENT raft - DESTINATION include/raft) +install( + FILES ${CMAKE_CURRENT_BINARY_DIR}/include/raft/version_config.hpp + COMPONENT raft + DESTINATION include/raft +) -############################################################################## -# - export/install optional components -------------------------------------- +# ################################################################################################## +# * export/install optional components -------------------------------------- include("${rapids-cmake-dir}/export/write_dependencies.cmake") -set(raft_components distance nn) -set(raft_install_comp raft raft) +set(raft_components distance nn distributed) +set(raft_install_comp raft raft raft) if(TARGET raft_distance_lib) list(APPEND raft_components distance-lib) list(APPEND raft_install_comp distance) @@ -479,30 +575,31 @@ endif() foreach(comp install_comp IN ZIP_LISTS raft_components raft_install_comp) install( - EXPORT raft-${comp}-exports - FILE raft-${comp}-targets.cmake - NAMESPACE raft:: - DESTINATION "${lib_dir}/cmake/raft" - COMPONENT ${install_comp} + EXPORT raft-${comp}-exports + FILE raft-${comp}-targets.cmake + NAMESPACE raft:: + DESTINATION "${lib_dir}/cmake/raft" + COMPONENT ${install_comp} ) export( - EXPORT raft-${comp}-exports - FILE ${RAFT_BINARY_DIR}/raft-${comp}-targets.cmake - NAMESPACE raft:: + EXPORT raft-${comp}-exports + FILE ${RAFT_BINARY_DIR}/raft-${comp}-targets.cmake + NAMESPACE raft:: ) rapids_export_write_dependencies( - BUILD raft-${comp}-exports "${PROJECT_BINARY_DIR}/raft-${comp}-dependencies.cmake" + BUILD raft-${comp}-exports "${PROJECT_BINARY_DIR}/raft-${comp}-dependencies.cmake" ) rapids_export_write_dependencies( - INSTALL raft-${comp}-exports "${PROJECT_BINARY_DIR}/rapids-cmake/raft/export/${install_comp}/raft-${comp}-dependencies.cmake" + INSTALL raft-${comp}-exports + 
"${PROJECT_BINARY_DIR}/rapids-cmake/raft/export/${install_comp}/raft-${comp}-dependencies.cmake" ) endforeach() -############################################################################## -# - install export ----------------------------------------------------------- +# ################################################################################################## +# * install export ----------------------------------------------------------- set(doc_string - [=[ + [=[ Provide targets for the RAFT: Reusable Accelerated Functions and Tools RAFT contains fundamental widely-used algorithms and primitives @@ -511,27 +608,35 @@ for data science and machine learning. Optional Components: - nn - distance + - distributed Imported Targets: - raft::raft - raft::nn brought in by the `nn` optional component - raft::distance brought in by the `distance` optional component + - raft::distributed brought in by the `distributed` optional component -]=]) +]=] +) set(code_string ${nvtx_export_string}) if(RAFT_ENABLE_thrust_DEPENDENCY) - string(APPEND code_string - [=[ + string( + APPEND + code_string + [=[ if(NOT TARGET raft::Thrust) thrust_create_target(raft::Thrust FROM_OPTIONS) endif() - ]=]) + ]=] + ) endif() -string(APPEND code_string -[=[ +string( + APPEND + code_string + [=[ if(distance IN_LIST raft_FIND_COMPONENTS) enable_language(CUDA) endif() @@ -545,45 +650,58 @@ if(nn IN_LIST raft_FIND_COMPONENTS) add_library(faiss ALIAS faiss::faiss) endif() endif() -]=]) +]=] +) # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) -raft_export(INSTALL raft - COMPONENTS nn distance - EXPORT_SET raft-exports - GLOBAL_TARGETS raft nn distance - NAMESPACE raft:: - DOCUMENTATION doc_string - FINAL_CODE_BLOCK code_string) - -############################################################################## -# - build export ------------------------------------------------------------- -raft_export(BUILD raft - EXPORT_SET raft-exports - 
COMPONENTS nn distance - GLOBAL_TARGETS raft raft_distance raft_nn - DOCUMENTATION doc_string - NAMESPACE raft:: - FINAL_CODE_BLOCK code_string) - -############################################################################## -# - build test executable ---------------------------------------------------- +raft_export( + INSTALL raft COMPONENTS nn distance distributed EXPORT_SET raft-exports GLOBAL_TARGETS raft nn + distance distributed NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string +) + +# ################################################################################################## +# * build export ------------------------------------------------------------- +raft_export( + BUILD + raft + EXPORT_SET + raft-exports + COMPONENTS + nn + distance + distributed + GLOBAL_TARGETS + raft + raft_distance + distributed + raft_nn + DOCUMENTATION + doc_string + NAMESPACE + raft:: + FINAL_CODE_BLOCK + code_string +) + +# ################################################################################################## +# * build test executable ---------------------------------------------------- if(BUILD_TESTS) include(test/CMakeLists.txt) endif() -############################################################################## -# - build benchmark executable ----------------------------------------------- +# ################################################################################################## +# * build benchmark executable ----------------------------------------------- if(BUILD_BENCH) include(bench/CMakeLists.txt) endif() -############################################################################## -# - doxygen targets ---------------------------------------------------------- +# ################################################################################################## +# * doxygen targets ---------------------------------------------------------- include(cmake/doxygen.cmake) -add_doxygen_target(IN_DOXYFILE 
doxygen/Doxyfile.in - OUT_DOXYFILE ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile - CWD ${CMAKE_CURRENT_BINARY_DIR}) +add_doxygen_target( + IN_DOXYFILE doxygen/Doxyfile.in OUT_DOXYFILE ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile CWD + ${CMAKE_CURRENT_BINARY_DIR} +) diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index 81e894fbbc..4e6b6ceb40 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -1,141 +1,134 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2022, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= -################################################################################################### -# - compiler function ----------------------------------------------------------------------------- +# ################################################################################################## +# * compiler function ----------------------------------------------------------------------------- function(ConfigureBench) -set(options OPTIONAL DIST NN) -set(oneValueArgs NAME ) -set(multiValueArgs PATH TARGETS CONFIGURATIONS) - -cmake_parse_arguments(ConfigureBench "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN} ) - -set(BENCH_NAME ${ConfigureBench_NAME}) - -add_executable(${BENCH_NAME} ${ConfigureBench_PATH}) - -target_link_libraries(${BENCH_NAME} - PRIVATE - raft::raft - $<$:raft::distance> - $<$:raft::nn> - benchmark::benchmark - Threads::Threads - $ - $ - ) - -set_target_properties(${BENCH_NAME} - PROPERTIES - # set target compile options - INSTALL_RPATH "\$ORIGIN/../../../lib" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON - ) - -target_compile_options(${BENCH_NAME} - PRIVATE "$<$:${RAFT_CXX_FLAGS}>" - "$<$:${RAFT_CUDA_FLAGS}>" - ) - -target_include_directories(${BENCH_NAME} - PUBLIC "$" - ) - -install( - TARGETS ${BENCH_NAME} - COMPONENT testing - DESTINATION bin/gbench/libraft - EXCLUDE_FROM_ALL) + set(options OPTIONAL DIST NN) + set(oneValueArgs NAME) + set(multiValueArgs PATH TARGETS CONFIGURATIONS) + + cmake_parse_arguments(ConfigureBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(BENCH_NAME ${ConfigureBench_NAME}) + + add_executable(${BENCH_NAME} ${ConfigureBench_PATH}) + + target_link_libraries( + ${BENCH_NAME} + PRIVATE raft::raft + $<$:raft::distance> + $<$:raft::nn> + benchmark::benchmark + Threads::Threads + $ + $ 
+ ) + + set_target_properties( + ${BENCH_NAME} + PROPERTIES # set target compile options + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + + target_compile_options( + ${BENCH_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) + + target_include_directories(${BENCH_NAME} PUBLIC "$") + + install( + TARGETS ${BENCH_NAME} + COMPONENT testing + DESTINATION bin/gbench/libraft + EXCLUDE_FROM_ALL + ) endfunction() if(BUILD_BENCH) - ConfigureBench(NAME CLUSTER_BENCH - PATH - bench/cluster/kmeans_balanced.cu - bench/cluster/kmeans.cu - bench/main.cpp - OPTIONAL DIST NN - ) - - ConfigureBench(NAME DISTANCE_BENCH - PATH - bench/distance/distance_cosine.cu - bench/distance/distance_exp_l2.cu - bench/distance/distance_l1.cu - bench/distance/distance_unexp_l2.cu - bench/distance/fused_l2_nn.cu - bench/distance/kernels.cu - bench/main.cpp - OPTIONAL DIST - ) - - ConfigureBench(NAME LINALG_BENCH - PATH - bench/linalg/add.cu - bench/linalg/map_then_reduce.cu - bench/linalg/matrix_vector_op.cu - bench/linalg/reduce_rows_by_key.cu - bench/linalg/reduce.cu - bench/main.cpp - ) - - ConfigureBench(NAME MATRIX_BENCH - PATH - bench/matrix/argmin.cu - bench/main.cpp - ) - - ConfigureBench(NAME RANDOM_BENCH - PATH - bench/random/make_blobs.cu - bench/random/permute.cu - bench/random/rng.cu - bench/main.cpp - ) - - ConfigureBench(NAME SPARSE_BENCH - PATH - bench/sparse/convert_csr.cu - bench/main.cpp - ) - - ConfigureBench(NAME NEIGHBORS_BENCH - PATH - bench/neighbors/knn/brute_force_float_int64_t.cu - bench/neighbors/knn/brute_force_float_uint32_t.cu - bench/neighbors/knn/ivf_flat_float_int64_t.cu - bench/neighbors/knn/ivf_flat_float_uint32_t.cu - bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu - bench/neighbors/knn/ivf_flat_uint8_t_uint32_t.cu - bench/neighbors/knn/ivf_pq_float_int64_t.cu - 
bench/neighbors/knn/ivf_pq_float_uint32_t.cu - bench/neighbors/knn/ivf_pq_int8_t_int64_t.cu - bench/neighbors/knn/ivf_pq_uint8_t_uint32_t.cu - bench/neighbors/selection.cu - bench/main.cpp - OPTIONAL DIST NN - ) + ConfigureBench( + NAME CLUSTER_BENCH PATH bench/cluster/kmeans_balanced.cu bench/cluster/kmeans.cu bench/main.cpp + OPTIONAL DIST NN + ) + + ConfigureBench( + NAME + DISTANCE_BENCH + PATH + bench/distance/distance_cosine.cu + bench/distance/distance_exp_l2.cu + bench/distance/distance_l1.cu + bench/distance/distance_unexp_l2.cu + bench/distance/fused_l2_nn.cu + bench/distance/kernels.cu + bench/main.cpp + OPTIONAL + DIST + ) + + ConfigureBench( + NAME + LINALG_BENCH + PATH + bench/linalg/add.cu + bench/linalg/map_then_reduce.cu + bench/linalg/matrix_vector_op.cu + bench/linalg/norm.cu + bench/linalg/normalize.cu + bench/linalg/reduce_rows_by_key.cu + bench/linalg/reduce.cu + bench/main.cpp + ) + + ConfigureBench(NAME MATRIX_BENCH PATH bench/matrix/argmin.cu bench/main.cpp) + + ConfigureBench( + NAME RANDOM_BENCH PATH bench/random/make_blobs.cu bench/random/permute.cu bench/random/rng.cu + bench/main.cpp + ) + + ConfigureBench(NAME SPARSE_BENCH PATH bench/sparse/convert_csr.cu bench/main.cpp) + + ConfigureBench( + NAME + NEIGHBORS_BENCH + PATH + bench/neighbors/knn/brute_force_float_int64_t.cu + bench/neighbors/knn/brute_force_float_uint32_t.cu + bench/neighbors/knn/ivf_flat_float_int64_t.cu + bench/neighbors/knn/ivf_flat_float_uint32_t.cu + bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu + bench/neighbors/knn/ivf_flat_uint8_t_uint32_t.cu + bench/neighbors/knn/ivf_pq_float_int64_t.cu + bench/neighbors/knn/ivf_pq_float_uint32_t.cu + bench/neighbors/knn/ivf_pq_int8_t_int64_t.cu + bench/neighbors/knn/ivf_pq_uint8_t_uint32_t.cu + bench/neighbors/refine.cu + bench/neighbors/selection.cu + bench/main.cpp + OPTIONAL + DIST + NN + ) endif() - diff --git a/cpp/bench/linalg/norm.cu b/cpp/bench/linalg/norm.cu new file mode 100644 index 0000000000..cce4195cf1 --- 
/dev/null +++ b/cpp/bench/linalg/norm.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include + +namespace raft::bench::linalg { + +template +struct norm_input { + IdxT rows, cols; +}; + +template +inline auto operator<<(std::ostream& os, const norm_input& p) -> std::ostream& +{ + os << p.rows << "#" << p.cols; + return os; +} + +template +struct rowNorm : public fixture { + rowNorm(const norm_input& p) : params(p), in(p.rows * p.cols, stream), dots(p.rows, stream) + { + raft::random::RngState rng{1234}; + raft::random::uniform(rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0, stream); + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + loop_on_state(state, [this]() { + auto input_view = raft::make_device_matrix_view( + in.data(), params.rows, params.cols); + auto output_view = + raft::make_device_vector_view(dots.data(), params.rows); + raft::linalg::norm(handle, + input_view, + output_view, + raft::linalg::L2Norm, + raft::linalg::Apply::ALONG_ROWS, + raft::SqrtOp()); + }); + } + + private: + norm_input params; + rmm::device_uvector in, dots; +}; // struct rowNorm + +const std::vector> norm_inputs_i32 = + raft::util::itertools::product>({10, 100, 1000, 10000, 100000}, + {16, 32, 64, 128, 256, 512, 1024}); +const 
std::vector> norm_inputs_i64 = + raft::util::itertools::product>({10, 100, 1000, 10000, 100000}, + {16, 32, 64, 128, 256, 512, 1024}); + +RAFT_BENCH_REGISTER((rowNorm), "", norm_inputs_i32); +RAFT_BENCH_REGISTER((rowNorm), "", norm_inputs_i32); +RAFT_BENCH_REGISTER((rowNorm), "", norm_inputs_i64); +RAFT_BENCH_REGISTER((rowNorm), "", norm_inputs_i64); + +} // namespace raft::bench::linalg diff --git a/cpp/bench/linalg/normalize.cu b/cpp/bench/linalg/normalize.cu new file mode 100644 index 0000000000..d01473ffeb --- /dev/null +++ b/cpp/bench/linalg/normalize.cu @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +namespace raft::bench::linalg { + +template +struct normalize_input { + IdxT rows, cols; +}; + +template +inline auto operator<<(std::ostream& os, const normalize_input& p) -> std::ostream& +{ + os << p.rows << "#" << p.cols; + return os; +} + +template +struct rowNormalize : public fixture { + rowNormalize(const normalize_input& p) + : params(p), in(p.rows * p.cols, stream), out(p.rows * p.cols, stream) + { + raft::random::RngState rng{1234}; + raft::random::uniform(rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0, stream); + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + loop_on_state(state, [this]() { + auto input_view = raft::make_device_matrix_view( + in.data(), params.rows, params.cols); + auto output_view = raft::make_device_matrix_view( + out.data(), params.rows, params.cols); + raft::linalg::row_normalize(handle, input_view, output_view, raft::linalg::L2Norm); + }); + } + + private: + normalize_input params; + rmm::device_uvector in, out; +}; // struct rowNormalize + +const std::vector> normalize_inputs_i32 = + raft::util::itertools::product>( + {10, 100, 1000, 10000, 100000}, {8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}); +const std::vector> normalize_inputs_i64 = + raft::util::itertools::product>( + {10, 100, 1000, 10000, 100000}, {8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}); + +RAFT_BENCH_REGISTER((rowNormalize), "", normalize_inputs_i32); +RAFT_BENCH_REGISTER((rowNormalize), "", normalize_inputs_i32); +RAFT_BENCH_REGISTER((rowNormalize), "", normalize_inputs_i64); +RAFT_BENCH_REGISTER((rowNormalize), "", normalize_inputs_i64); + +} // namespace raft::bench::linalg diff --git a/cpp/bench/neighbors/refine.cu b/cpp/bench/neighbors/refine.cu new file mode 100644 index 0000000000..a038905ace --- /dev/null +++ b/cpp/bench/neighbors/refine.cu 
@@ -0,0 +1,122 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include + +#if defined RAFT_DISTANCE_COMPILED +#include +#endif + +#if defined RAFT_NN_COMPILED +#include +#endif + +#include +#include +#include + +#include "../../test/neighbors/refine_helper.cuh" + +#include +#include + +using namespace raft::neighbors::detail; + +namespace raft::bench::neighbors { + +template +inline auto operator<<(std::ostream& os, const RefineInputs& p) -> std::ostream& +{ + os << p.n_rows << "#" << p.dim << "#" << p.n_queries << "#" << p.k0 << "#" << p.k << "#" + << (p.host_data ? 
"host" : "device"); + return os; +} + +RefineInputs p; + +template +class RefineAnn : public fixture { + public: + RefineAnn(RefineInputs p) : data(handle_, p) {} + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << data.p; + state.SetLabel(label_stream.str()); + + auto old_mr = rmm::mr::get_current_device_resource(); + rmm::mr::pool_memory_resource pool_mr(old_mr); + rmm::mr::set_current_device_resource(&pool_mr); + + if (data.p.host_data) { + loop_on_state(state, [this]() { + raft::neighbors::refine(handle_, + data.dataset_host.view(), + data.queries_host.view(), + data.candidates_host.view(), + data.refined_indices_host.view(), + data.refined_distances_host.view(), + data.p.metric); + }); + } else { + loop_on_state(state, [&]() { + raft::neighbors::refine(handle_, + data.dataset.view(), + data.queries.view(), + data.candidates.view(), + data.refined_indices.view(), + data.refined_distances.view(), + data.p.metric); + }); + } + rmm::mr::set_current_device_resource(old_mr); + } + + private: + raft::handle_t handle_; + RefineHelper data; +}; + +std::vector> getInputs() +{ + std::vector> out; + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded; + for (bool host_data : {true, false}) { + for (int64_t n_queries : {1000, 10000}) { + for (int64_t dim : {128, 512}) { + out.push_back(RefineInputs{n_queries, 2000000, dim, 32, 128, metric, host_data}); + out.push_back(RefineInputs{n_queries, 2000000, dim, 10, 40, metric, host_data}); + } + } + } + return out; +} + +using refine_float_int64 = RefineAnn; +RAFT_BENCH_REGISTER(refine_float_int64, "", getInputs()); + +using refine_uint8_int64 = RefineAnn; +RAFT_BENCH_REGISTER(refine_uint8_int64, "", getInputs()); +} // namespace raft::bench::neighbors diff --git a/cpp/cmake/config.json b/cpp/cmake/config.json new file mode 100644 index 0000000000..f7cc50e513 --- /dev/null +++ b/cpp/cmake/config.json @@ -0,0 +1,43 @@ +{ + "parse": { + 
"additional_commands": { + "CPMFindPackage": { + "kwargs": { + "NAME": 1, + "GITHUB_REPOSITORY": "?", + "GIT_TAG": "?", + "VERSION": "?", + "GIT_SHALLOW": "?", + "OPTIONS": "*", + "FIND_PACKAGE_ARGUMENTS": "*" + } + }, + "ConfigureTest": { + "flags": ["TEST_NAME", "TEST_SRC"] + }, + "ConfigureBench": { + "flags": ["BENCH_NAME", "BENCH_SRC"] + } + } + }, + "format": { + "line_width": 100, + "tab_size": 2, + "command_case": "unchanged", + "max_lines_hwrap": 1, + "max_pargs_hwrap": 999, + "dangle_parens": true + }, + "lint": { + "disabled_codes": ["C0301", "C0111", "C0113"], + "function_pattern": "[0-9A-z_]+", + "macro_pattern": "[0-9A-z_]+", + "global_var_pattern": "[A-z][0-9A-z_]+", + "internal_var_pattern": "_[A-z][0-9A-z_]+", + "local_var_pattern": "[A-z][A-z0-9_]+", + "private_var_pattern": "_[0-9A-z_]+", + "public_var_pattern": "[A-z][0-9A-z_]+", + "argument_var_pattern": "[A-z][A-z0-9_]+", + "keyword_pattern": "[A-z][0-9A-z_]+" + } +} diff --git a/cpp/cmake/doxygen.cmake b/cpp/cmake/doxygen.cmake index 7d06ec194c..a6fddbacd6 100644 --- a/cpp/cmake/doxygen.cmake +++ b/cpp/cmake/doxygen.cmake @@ -1,16 +1,14 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. # find_package(Doxygen 1.8.11) @@ -24,11 +22,13 @@ function(add_doxygen_target) configure_file(${dox_IN_DOXYFILE} ${dox_OUT_DOXYFILE} @ONLY) message("Command: ${DOXYGEN_EXECUTABLE} ${dox_OUT_DOXYFILE}") - add_custom_target(docs_raft + add_custom_target( + docs_raft ${DOXYGEN_EXECUTABLE} ${dox_OUT_DOXYFILE} WORKING_DIRECTORY ${dox_CWD} VERBATIM - COMMENT "Generate doxygen docs") + COMMENT "Generate doxygen docs" + ) else() message("add_doxygen_target: doxygen exe not found") endif() diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake index 440c8c4f3a..5e68ca5bc4 100644 --- a/cpp/cmake/modules/ConfigureCUDA.cmake +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -1,26 +1,24 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2018-2022, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= if(DISABLE_DEPRECATION_WARNINGS) - list(APPEND RAFT_CXX_FLAGS -Wno-deprecated-declarations) - list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations) + list(APPEND RAFT_CXX_FLAGS -Wno-deprecated-declarations) + list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations) endif() if(CMAKE_COMPILER_IS_GNUCXX) - list(APPEND RAFT_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) + list(APPEND RAFT_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) endif() list(APPEND RAFT_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) @@ -31,21 +29,23 @@ list(APPEND RAFT_CUDA_FLAGS -Xfatbin=-compress-all) # set warnings as errors if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0) - list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings) + list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings) endif() list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) -# Option to enable line info in CUDA device compilation to allow introspection when profiling / memchecking +# Option to enable line info in CUDA device compilation to allow introspection when profiling / +# memchecking if(CUDA_ENABLE_LINEINFO) - list(APPEND RAFT_CUDA_FLAGS -lineinfo) + list(APPEND RAFT_CUDA_FLAGS -lineinfo) endif() if(OpenMP_FOUND) - list(APPEND RAFT_CUDA_FLAGS -Xcompiler=${OpenMP_CXX_FLAGS}) + list(APPEND 
RAFT_CUDA_FLAGS -Xcompiler=${OpenMP_CXX_FLAGS}) endif() # Debug options if(CMAKE_BUILD_TYPE MATCHES Debug) - message(VERBOSE "RAFT: Building with debugging flags") - list(APPEND RAFT_CUDA_FLAGS -G -Xcompiler=-rdynamic) + message(VERBOSE "RAFT: Building with debugging flags") + list(APPEND RAFT_CUDA_FLAGS -G -Xcompiler=-rdynamic) + list(APPEND RAFT_CUDA_FLAGS -Xptxas --suppress-stack-size-warning) endif() diff --git a/cpp/cmake/modules/raft_export.cmake b/cpp/cmake/modules/raft_export.cmake index 748fa8ad26..bcc3578bf8 100644 --- a/cpp/cmake/modules/raft_export.cmake +++ b/cpp/cmake/modules/raft_export.cmake @@ -1,18 +1,16 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2021-2022, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= include_guard(GLOBAL) #[=======================================================================[.rst: @@ -41,14 +39,14 @@ calls to :cmake:command:`find_dependency`, or :cmake:command:`CPMFindPackage`. .. note:: :cmake:command:`raft_export` always installs to `lib` and doesn't use GNUInstallDirs - The files generated by :cmake:command:`raft_export` are completly standalone + The files generated by :cmake:command:`raft_export` are completely standalone and don't require the consuming package to use `rapids-cmake` ``project_name`` Name of the project, to be used by consumers when using `find_package` ``GLOBAL_TARGETS`` - Explicitly list what targets should be made globally visibile to + Explicitly list what targets should be made globally visible to the consuming project. ``VERSION`` @@ -61,9 +59,9 @@ calls to :cmake:command:`find_dependency`, or :cmake:command:`CPMFindPackage`. Depending on the version string different compatibility modes will be used. 
+------------------+---------------------+ - | Version String | Compatiblity Type | + | Version String | Compatibility Type | +==================+=====================+ - | None | No checks perfomed | + | None | No checks performed | +------------------+---------------------+ | X | SameMajorVersion | +------------------+---------------------+ @@ -174,19 +172,26 @@ function(raft_export type project_name) set(scratch_dir "${PROJECT_BINARY_DIR}/rapids-cmake/${project_name}/export/raft/") - configure_package_config_file("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/config.cmake.in" - "${scratch_dir}/${project_name}-config.cmake" - INSTALL_DESTINATION "${install_location}") + configure_package_config_file( + "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/config.cmake.in" + "${scratch_dir}/${project_name}-config.cmake" INSTALL_DESTINATION "${install_location}" + ) if(rapids_version_set) write_basic_package_version_file( - "${scratch_dir}/${project_name}-config-version.cmake" VERSION ${rapids_project_version} - COMPATIBILITY ${rapids_project_version_compat}) + "${scratch_dir}/${project_name}-config-version.cmake" + VERSION ${rapids_project_version} + COMPATIBILITY ${rapids_project_version_compat} + ) endif() - install(EXPORT ${RAPIDS_EXPORT_SET} FILE ${project_name}-targets.cmake - NAMESPACE ${RAPIDS_PROJECT_VERSION} DESTINATION "${install_location}" - COMPONENT raft) + install( + EXPORT ${RAPIDS_EXPORT_SET} + FILE ${project_name}-targets.cmake + NAMESPACE ${RAPIDS_PROJECT_VERSION} + DESTINATION "${install_location}" + COMPONENT raft + ) if(TARGET rapids_export_install_${RAPIDS_EXPORT_SET}) include("${rapids-cmake-dir}/export/write_dependencies.cmake") @@ -203,41 +208,55 @@ function(raft_export type project_name) endif() # Install everything we have generated - install(DIRECTORY "${scratch_dir}/" DESTINATION "${install_location}" - COMPONENT raft) + install( + DIRECTORY "${scratch_dir}/" + DESTINATION "${install_location}" + COMPONENT raft + ) foreach(comp nn distance) set(scratch_dir 
"${PROJECT_BINARY_DIR}/rapids-cmake/${project_name}/export/${comp}/") file(MAKE_DIRECTORY "${scratch_dir}") - install(DIRECTORY "${scratch_dir}" DESTINATION "${install_location}" - COMPONENT ${comp}) + install( + DIRECTORY "${scratch_dir}" + DESTINATION "${install_location}" + COMPONENT ${comp} + ) endforeach() else() set(install_location "${PROJECT_BINARY_DIR}") - configure_package_config_file("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/config.cmake.in" - "${install_location}/${project_name}-config.cmake" - INSTALL_DESTINATION "${install_location}") + configure_package_config_file( + "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/config.cmake.in" + "${install_location}/${project_name}-config.cmake" INSTALL_DESTINATION "${install_location}" + ) if(rapids_version_set) write_basic_package_version_file( - "${install_location}/${project_name}-config-version.cmake" VERSION ${rapids_project_version} - COMPATIBILITY ${rapids_project_version_compat}) + "${install_location}/${project_name}-config-version.cmake" + VERSION ${rapids_project_version} + COMPATIBILITY ${rapids_project_version_compat} + ) endif() - export(EXPORT ${RAPIDS_EXPORT_SET} NAMESPACE ${RAPIDS_PROJECT_VERSION} - FILE "${install_location}/${project_name}-targets.cmake") + export( + EXPORT ${RAPIDS_EXPORT_SET} + NAMESPACE ${RAPIDS_PROJECT_VERSION} + FILE "${install_location}/${project_name}-targets.cmake" + ) if(TARGET rapids_export_build_${RAPIDS_EXPORT_SET}) include("${rapids-cmake-dir}/export/write_dependencies.cmake") - rapids_export_write_dependencies(BUILD ${RAPIDS_EXPORT_SET} - "${install_location}/${project_name}-dependencies.cmake") + rapids_export_write_dependencies( + BUILD ${RAPIDS_EXPORT_SET} "${install_location}/${project_name}-dependencies.cmake" + ) endif() if(DEFINED RAPIDS_LANGUAGES) include("${rapids-cmake-dir}/export/write_language.cmake") foreach(lang IN LISTS RAPIDS_LANGUAGES) - rapids_export_write_language(BUILD ${lang} - "${install_location}/${project_name}-${lang}-language.cmake") + 
rapids_export_write_language( + BUILD ${lang} "${install_location}/${project_name}-${lang}-language.cmake" + ) endforeach() endif() diff --git a/cpp/cmake/thirdparty/get_cutlass.cmake b/cpp/cmake/thirdparty/get_cutlass.cmake new file mode 100644 index 0000000000..811a5466c3 --- /dev/null +++ b/cpp/cmake/thirdparty/get_cutlass.cmake @@ -0,0 +1,99 @@ +# ============================================================================= +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +function(find_and_configure_cutlass) + set(oneValueArgs VERSION REPOSITORY PINNED_TAG) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + # if(RAFT_ENABLE_DIST_DEPENDENCIES OR RAFT_COMPILE_LIBRARIES) + set(CUTLASS_ENABLE_HEADERS_ONLY + ON + CACHE BOOL "Enable only the header library" + ) + set(CUTLASS_NAMESPACE + "raft_cutlass" + CACHE STRING "Top level namespace of CUTLASS" + ) + set(CUTLASS_ENABLE_CUBLAS + OFF + CACHE BOOL "Disable CUTLASS to build with cuBLAS library." 
+ ) + + rapids_cpm_find( + NvidiaCutlass ${PKG_VERSION} + GLOBAL_TARGETS nvidia::cutlass::cutlass + CPM_ARGS + GIT_REPOSITORY ${PKG_REPOSITORY} + GIT_TAG ${PKG_PINNED_TAG} + GIT_SHALLOW TRUE + OPTIONS "CUDAToolkit_ROOT ${CUDAToolkit_LIBRARY_DIR}" + ) + + if(TARGET CUTLASS AND NOT TARGET nvidia::cutlass::cutlass) + add_library(nvidia::cutlass::cutlass ALIAS CUTLASS) + endif() + + if(NvidiaCutlass_ADDED) + rapids_export( + BUILD NvidiaCutlass + EXPORT_SET NvidiaCutlass + GLOBAL_TARGETS nvidia::cutlass::cutlass + NAMESPACE nvidia::cutlass:: + ) + endif() + # endif() + + # We generate the cutlass-config files when we built cutlass locally, so always do + # `find_dependency` + rapids_export_package( + BUILD NvidiaCutlass raft-distance-exports GLOBAL_TARGETS nvidia::cutlass::cutlass + ) + rapids_export_package( + INSTALL NvidiaCutlass raft-distance-exports GLOBAL_TARGETS nvidia::cutlass::cutlass + ) + rapids_export_package( + BUILD NvidiaCutlass raft-nn-exports GLOBAL_TARGETS nvidia::cutlass::cutlass + ) + rapids_export_package( + INSTALL NvidiaCutlass raft-nn-exports GLOBAL_TARGETS nvidia::cutlass::cutlass + ) + + # Tell cmake where it can find the generated NvidiaCutlass-config.cmake we wrote. 
+ include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + INSTALL NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}/../]=] raft-distance-exports + ) + rapids_export_find_package_root( + BUILD NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}]=] raft-distance-exports + ) + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + INSTALL NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}/../]=] raft-nn-exports + ) + rapids_export_find_package_root( + BUILD NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}]=] raft-nn-exports + ) +endfunction() + +if(NOT RAFT_CUTLASS_GIT_TAG) + set(RAFT_CUTLASS_GIT_TAG v2.9.1) +endif() + +if(NOT RAFT_CUTLASS_GIT_REPOSITORY) + set(RAFT_CUTLASS_GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git) +endif() + +find_and_configure_cutlass( + VERSION 2.9.1 REPOSITORY ${RAFT_CUTLASS_GIT_REPOSITORY} PINNED_TAG ${RAFT_CUTLASS_GIT_TAG} +) diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp index 6a4f323c58..e80d9a85fa 100644 --- a/cpp/include/raft.hpp +++ b/cpp/include/raft.hpp @@ -28,10 +28,10 @@ namespace raft { /* Function for testing RAFT include * - * @return message indicating RAFT has been included succesfully*/ + * @return message indicating RAFT has been included successfully*/ inline std::string test_raft() { - std::string status = "RAFT Setup succesfully"; + std::string status = "RAFT Setup successfully"; return status; } diff --git a/cpp/include/raft/cluster/detail/kmeans.cuh b/cpp/include/raft/cluster/detail/kmeans.cuh index 060d05a333..6cfa3156c9 100644 --- a/cpp/include/raft/cluster/detail/kmeans.cuh +++ b/cpp/include/raft/cluster/detail/kmeans.cuh @@ -942,7 +942,7 @@ void kmeans_fit(handle_t const& handle, RAFT_LOG_DEBUG( "KMeans.fit (Iteration-%d/%d): initialize cluster centers from " "the ndarray array input " - "passed to init arguement.", + "passed to init argument.", seed_iter + 1, n_init); raft::copy( @@ -1029,7 +1029,7 @@ void kmeans_predict(handle_t 
const& handle, auto metric = params.metric; // Allocate memory - // Device-accessible allocation of expandable storage used as temorary buffers + // Device-accessible allocation of expandable storage used as temporary buffers rmm::device_uvector workspace(0, stream); auto weight = raft::make_device_vector(handle, n_samples); if (sample_weight.has_value()) @@ -1226,7 +1226,7 @@ void kmeans_transform(const raft::handle_t& handle, auto n_clusters = params.n_clusters; auto metric = params.metric; - // Device-accessible allocation of expandable storage used as temorary buffers + // Device-accessible allocation of expandable storage used as temporary buffers rmm::device_uvector workspace(0, stream); auto dataBatchSize = getDataBatchSize(params.batch_samples, n_samples); diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index 2a35c1efa0..d64815244b 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -271,7 +271,7 @@ void transform(const raft::handle_t& handle, * [dim = n_samples x n_features] * @param[in] minClusterDistance Distance for every sample to it's nearest centroid * [dim = n_samples] - * @param[in] isSampleCentroid Flag the sample choosen as initial centroid + * @param[in] isSampleCentroid Flag the sample chosen as initial centroid * [dim = n_samples] * @param[in] select_op The sampling operation used to select the centroids * @param[out] inRankCp The sampled centroids @@ -798,7 +798,7 @@ using KeyValueIndexOp = kmeans::KeyValueIndexOp; * [dim = n_samples x n_features] * @param[in] minClusterDistance Distance for every sample to it's nearest centroid * [dim = n_samples] - * @param[in] isSampleCentroid Flag the sample choosen as initial centroid + * @param[in] isSampleCentroid Flag the sample chosen as initial centroid * [dim = n_samples] * @param[in] select_op The sampling operation used to select the centroids * @param[out] inRankCp The sampled centroids diff --git 
a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp index e64c6d9bf0..33892597d8 100644 --- a/cpp/include/raft/comms/detail/std_comms.hpp +++ b/cpp/include/raft/comms/detail/std_comms.hpp @@ -88,7 +88,7 @@ class std_comms : public comms_iface { /** * @brief constructor for collective-only operation - * @param nccl_comm initilized nccl communicator + * @param nccl_comm initialized nccl communicator * @param num_ranks size of the cluster * @param rank rank of the current worker * @param stream stream for ordering collective operations @@ -266,7 +266,7 @@ class std_comms : public comms_iface { bool restart = false; // resets the timeout when any progress was made // Causes UCP to progress through the send/recv message queue - while (ucp_handler_.ucp_progress(ucp_worker_) != 0) { + while (ucp_worker_progress(ucp_worker_) != 0) { restart = true; } diff --git a/cpp/include/raft/comms/detail/ucp_helper.hpp b/cpp/include/raft/comms/detail/ucp_helper.hpp index 668acafae4..9479bc24f9 100644 --- a/cpp/include/raft/comms/detail/ucp_helper.hpp +++ b/cpp/include/raft/comms/detail/ucp_helper.hpp @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include @@ -26,23 +25,6 @@ namespace raft { namespace comms { namespace detail { -typedef void (*dlsym_print_info)(ucp_ep_h, FILE*); - -typedef void (*dlsym_rec_free)(void*); - -typedef int (*dlsym_worker_progress)(ucp_worker_h); - -typedef ucs_status_ptr_t (*dlsym_send)( - ucp_ep_h, const void*, size_t, ucp_datatype_t, ucp_tag_t, ucp_send_callback_t); - -typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, - void*, - size_t count, - ucp_datatype_t datatype, - ucp_tag_t, - ucp_tag_t, - ucp_tag_recv_callback_t); - /** * Standard UCX request object that will be passed * around asynchronously. This object is really @@ -90,79 +72,10 @@ static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_ } /** - * Helper class for managing `dlopen` state and - * interacting with ucp. 
+ * Helper class for interacting with ucp. */ class comms_ucp_handler { - public: - comms_ucp_handler() - { - load_ucp_handle(); - load_send_func(); - load_recv_func(); - load_free_req_func(); - load_print_info_func(); - load_worker_progress_func(); - } - - ~comms_ucp_handler() { dlclose(ucp_handle); } - private: - void* ucp_handle; - - dlsym_print_info print_info_func; - dlsym_rec_free req_free_func; - dlsym_worker_progress worker_progress_func; - dlsym_send send_func; - dlsym_recv recv_func; - - void load_ucp_handle() - { - ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NOLOAD | RTLD_NODELETE); - if (!ucp_handle) { - ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NODELETE); - ASSERT(ucp_handle, "Cannot open UCX library: %s\n", dlerror()); - } - // Reset any potential error - dlerror(); - } - - void assert_dlerror() - { - char* error = dlerror(); - ASSERT(error == NULL, "Error loading function symbol: %s\n", error); - } - - void load_send_func() - { - send_func = (dlsym_send)dlsym(ucp_handle, "ucp_tag_send_nb"); - assert_dlerror(); - } - - void load_free_req_func() - { - req_free_func = (dlsym_rec_free)dlsym(ucp_handle, "ucp_request_free"); - assert_dlerror(); - } - - void load_print_info_func() - { - print_info_func = (dlsym_print_info)dlsym(ucp_handle, "ucp_ep_print_info"); - assert_dlerror(); - } - - void load_worker_progress_func() - { - worker_progress_func = (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress"); - assert_dlerror(); - } - - void load_recv_func() - { - recv_func = (dlsym_recv)dlsym(ucp_handle, "ucp_tag_recv_nb"); - assert_dlerror(); - } - ucp_tag_t build_message_tag(int rank, int tag) const { // keeping the rank in the lower bits enables debugging. 
@@ -170,8 +83,6 @@ class comms_ucp_handler { } public: - int ucp_progress(ucp_worker_h worker) const { return (*(worker_progress_func))(worker); } - /** * @brief Frees any memory underlying the given ucp request object */ @@ -179,7 +90,7 @@ class comms_ucp_handler { { if (request->needs_release) { request->req->completed = 0; - (*(req_free_func))(request->req); + ucp_request_free(request->req); } free(request); } @@ -198,7 +109,7 @@ class comms_ucp_handler { ucp_tag_t ucp_tag = build_message_tag(rank, tag); ucs_status_ptr_t send_result = - (*(send_func))(ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); + ucp_tag_send_nb(ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); struct ucx_context* ucp_req = (struct ucx_context*)send_result; if (UCS_PTR_IS_ERR(send_result)) { @@ -240,7 +151,7 @@ class comms_ucp_handler { ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag); ucs_status_ptr_t recv_result = - (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, tag_mask, recv_callback); + ucp_tag_recv_nb(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, tag_mask, recv_callback); struct ucx_context* ucp_req = (struct ucx_context*)recv_result; diff --git a/cpp/include/raft/core/comms.hpp b/cpp/include/raft/core/comms.hpp index 771f38fee3..78ce91dbf2 100644 --- a/cpp/include/raft/core/comms.hpp +++ b/cpp/include/raft/core/comms.hpp @@ -32,7 +32,7 @@ enum class op_t { SUM, PROD, MIN, MAX }; */ enum class status_t { SUCCESS, // Synchronization successful - ERROR, // An error occured querying sync status + ERROR, // An error occurred querying sync status ABORT // A failure occurred in sync, queued operations aborted }; diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp index 3386610224..f64f15d0d5 100644 --- a/cpp/include/raft/core/device_mdspan.hpp +++ b/cpp/include/raft/core/device_mdspan.hpp @@ -44,7 +44,6 @@ template > using managed_mdspan = mdspan>; -namespace detail { template struct 
is_device_mdspan : std::false_type { }; @@ -83,22 +82,20 @@ using is_input_managed_mdspan_t = is_managed_mdspan>; template using is_output_managed_mdspan_t = is_managed_mdspan>; -} // end namespace detail - /** * @\brief Boolean to determine if variadic template types Tn are either raft::device_mdspan or a * derived type */ template -inline constexpr bool is_device_mdspan_v = std::conjunction_v...>; +inline constexpr bool is_device_mdspan_v = std::conjunction_v...>; template inline constexpr bool is_input_device_mdspan_v = - std::conjunction_v...>; + std::conjunction_v...>; template inline constexpr bool is_output_device_mdspan_v = - std::conjunction_v...>; + std::conjunction_v...>; template using enable_if_device_mdspan = std::enable_if_t>; @@ -114,15 +111,15 @@ using enable_if_output_device_mdspan = std::enable_if_t -inline constexpr bool is_managed_mdspan_v = std::conjunction_v...>; +inline constexpr bool is_managed_mdspan_v = std::conjunction_v...>; template inline constexpr bool is_input_managed_mdspan_v = - std::conjunction_v...>; + std::conjunction_v...>; template inline constexpr bool is_output_managed_mdspan_v = - std::conjunction_v...>; + std::conjunction_v...>; template using enable_if_managed_mdspan = std::enable_if_t>; @@ -292,18 +289,6 @@ auto make_device_vector_view( return device_vector_view{ptr, mapping}; } -/** - * @brief Create a layout_stride mapping from extents and strides - * @param[in] extents the dimensionality of the layout - * @param[in] strides the strides between elements in the layout - * @return raft::layout_stride::mapping - */ -template -auto make_strided_layout(Extents extents, Strides strides) -{ - return layout_stride::mapping{extents, strides}; -} - /** * @brief Construct a strided vector layout mapping * diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp index 8348595db3..b932309d24 100644 --- a/cpp/include/raft/core/error.hpp +++ b/cpp/include/raft/core/error.hpp @@ -97,23 +97,23 @@ struct 
logic_error : public raft::exception { // FIXME: Need to be replaced with RAFT_FAIL /** macro to throw a runtime error */ -#define THROW(fmt, ...) \ - do { \ - int size1 = \ - std::snprintf(nullptr, 0, "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ - int size2 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__); \ - if (size1 < 0 || size2 < 0) \ - throw raft::exception("Error in snprintf, cannot handle raft exception."); \ - auto size = size1 + size2 + 1; /* +1 for final '\0' */ \ - auto buf = std::make_unique(size_t(size)); \ - std::snprintf(buf.get(), \ - size1 + 1 /* +1 for '\0' */, \ - "exception occured! file=%s line=%d: ", \ - __FILE__, \ - __LINE__); \ - std::snprintf(buf.get() + size1, size2 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__); \ - std::string msg(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */ \ - throw raft::exception(msg); \ +#define THROW(fmt, ...) \ + do { \ + int size1 = \ + std::snprintf(nullptr, 0, "exception occurred! file=%s line=%d: ", __FILE__, __LINE__); \ + int size2 = std::snprintf(nullptr, 0, fmt, ##__VA_ARGS__); \ + if (size1 < 0 || size2 < 0) \ + throw raft::exception("Error in snprintf, cannot handle raft exception."); \ + auto size = size1 + size2 + 1; /* +1 for final '\0' */ \ + auto buf = std::make_unique(size_t(size)); \ + std::snprintf(buf.get(), \ + size1 + 1 /* +1 for '\0' */, \ + "exception occurred! 
file=%s line=%d: ", \ + __FILE__, \ + __LINE__); \ + std::snprintf(buf.get() + size1, size2 + 1 /* +1 for '\0' */, fmt, ##__VA_ARGS__); \ + std::string msg(buf.get(), buf.get() + size - 1); /* -1 to remove final '\0' */ \ + throw raft::exception(msg); \ } while (0) // FIXME: Need to be replaced with RAFT_EXPECTS @@ -148,7 +148,7 @@ struct logic_error : public raft::exception { * * @param[in] cond Expression that evaluates to true or false * @param[in] fmt String literal description of the reason that cond is expected to be true with - * optinal format tagas + * optional format tags * @throw raft::logic_error if the condition evaluates to false. */ #define RAFT_EXPECTS(cond, fmt, ...) \ @@ -164,7 +164,7 @@ struct logic_error : public raft::exception { * @brief Indicates that an erroneous code path has been taken. * * @param[in] fmt String literal description of the reason that this code path is erroneous with - * optinal format tagas + * optional format tags * @throw always throws raft::logic_error */ #define RAFT_FAIL(fmt, ...) \ diff --git a/cpp/include/raft/core/host_device_accessor.hpp b/cpp/include/raft/core/host_device_accessor.hpp index 81bf015f2e..e9ebdb6c9f 100644 --- a/cpp/include/raft/core/host_device_accessor.hpp +++ b/cpp/include/raft/core/host_device_accessor.hpp @@ -22,7 +22,7 @@ namespace raft { /** * @brief A mixin to distinguish host and device memory. This is the primary - * accessor used throught RAFT's APIs to denote whether an underlying pointer + * accessor used throughout RAFT's APIs to denote whether an underlying pointer * is accessible from device, host, or both.
*/ template diff --git a/cpp/include/raft/core/host_mdspan.hpp b/cpp/include/raft/core/host_mdspan.hpp index d3d6c53df3..1a0ea6432f 100644 --- a/cpp/include/raft/core/host_mdspan.hpp +++ b/cpp/include/raft/core/host_mdspan.hpp @@ -36,8 +36,6 @@ template > using host_mdspan = mdspan>; -namespace detail { - template struct is_host_mdspan : std::false_type { }; @@ -57,22 +55,18 @@ using is_input_host_mdspan_t = is_host_mdspan>; template using is_output_host_mdspan_t = is_host_mdspan>; -} // namespace detail - /** * @\brief Boolean to determine if variadic template types Tn are either raft::host_mdspan or a * derived type */ template -inline constexpr bool is_host_mdspan_v = std::conjunction_v...>; +inline constexpr bool is_host_mdspan_v = std::conjunction_v...>; template -inline constexpr bool is_input_host_mdspan_v = - std::conjunction_v...>; +inline constexpr bool is_input_host_mdspan_v = std::conjunction_v...>; template -inline constexpr bool is_output_host_mdspan_v = - std::conjunction_v...>; +inline constexpr bool is_output_host_mdspan_v = std::conjunction_v...>; template using enable_if_host_mdspan = std::enable_if_t>; diff --git a/cpp/include/raft/core/mdspan.hpp b/cpp/include/raft/core/mdspan.hpp index db131ff6fa..786ce69f89 100644 --- a/cpp/include/raft/core/mdspan.hpp +++ b/cpp/include/raft/core/mdspan.hpp @@ -194,26 +194,15 @@ auto make_mdspan(ElementType* ptr, extents exts) } /** - * @brief Create a raft::mdspan - * @tparam ElementType the data type of the matrix elements - * @tparam IndexType the index type of the extents - * @tparam LayoutPolicy policy for strides and layout ordering - * @tparam MemType the raft::memory_type for where the data are stored - * @param ptr Pointer to the data - * @param exts dimensionality of the array (series of integers) - * @return raft::mdspan + * @brief Create a layout_stride mapping from extents and strides + * @param[in] extents the dimensionality of the layout + * @param[in] strides the strides between elements in 
the layout + * @return raft::layout_stride::mapping */ -template -auto make_mdspan(ElementType* ptr, extents exts) +template +auto make_strided_layout(Extents extents, Strides strides) { - using accessor_type = - host_device_accessor, MemType>; - - return mdspan{ptr, exts}; + return layout_stride::mapping{extents, strides}; } /** diff --git a/cpp/include/raft/core/nvtx.hpp b/cpp/include/raft/core/nvtx.hpp index 3dbe1dd511..09a41f10a6 100644 --- a/cpp/include/raft/core/nvtx.hpp +++ b/cpp/include/raft/core/nvtx.hpp @@ -32,7 +32,7 @@ * \code{.cpp} * #include * void some_function(int k){ - * // Begins a NVTX range with the messsage "some_function_{k}" + * // Begins a NVTX range with the message "some_function_{k}" * // The range ends when some_function() returns * common::nvtx::range fun_scope( r{"some_function_%d", k}; * diff --git a/cpp/include/raft/distance/detail/cosine.cuh b/cpp/include/raft/distance/detail/cosine.cuh index b7eed3e2a8..f06051962f 100644 --- a/cpp/include/raft/distance/detail/cosine.cuh +++ b/cpp/include/raft/distance/detail/cosine.cuh @@ -17,12 +17,23 @@ #pragma once #include +#include #include namespace raft { namespace distance { namespace detail { +template +struct CosineOp { + __device__ CosineOp() noexcept {} + __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept + { + return static_cast(1.0) - (AccT)(accVal / (aNorm * bNorm)); + } + __device__ AccT operator()(DataT aData) const noexcept { return aData; } +}; + /** * @brief the cosine distance matrix calculation implementer * It computes the following equation: @@ -71,61 +82,74 @@ void cosineImpl(const DataT* x, FinalLambda fin_op, cudaStream_t stream) { - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; +#if (__CUDACC_VER_MAJOR__ < 12) + const auto deviceVersion = getComputeCapability(); + if (deviceVersion.first >= 8) { + using CosineOp_ = CosineOp; + CosineOp_ cosine_dist_op; + 
+ cutlassDistanceKernel( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, cosine_dist_op, stream); - typedef typename std::conditional::type KPolicy; + } else +#endif + { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - dim3 blk(KPolicy::Nthreads); + typedef typename std::conditional::type KPolicy; - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; + dim3 blk(KPolicy::Nthreads); - // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = acc[i][j] / (regxn[i] * regyn[j]); + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + acc[i][j] = 1.0 - (acc[i][j] / (regxn[i] * regyn[j])); + } } - } - }; + }; - constexpr size_t shmemSize = - KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); - if (isRowMajor) { - auto cosineRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); - cosineRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto cosineColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); - cosineColMajor<<>>( - x, 
y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + constexpr size_t shmemSize = + KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); + if (isRowMajor) { + auto cosineRowMajor = pairwiseDistanceMatKernelPriorToAmpere; + dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); + cosineRowMajor<<>>( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + } else { + auto cosineColMajor = pairwiseDistanceMatKernelPriorToAmpere; + dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); + cosineColMajor<<>>( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + } } RAFT_CUDA_TRY(cudaGetLastError()); @@ -207,13 +231,11 @@ void cosineAlgo1(Index_ m, { auto norm_op = [] __device__(AccType in) { return raft::mySqrt(in); }; - // Wrap fin_op to allow computing 1 - pA before calling fin_op - auto wrapped_fin_op = [fin_op] __device__(AccType d_val, Index_ g_d_idx) { - return fin_op(static_cast(1.0) - d_val, g_d_idx); - }; - - typedef std::is_same is_bool; - typedef typename std::conditional::type CosOutType; + // raft distance support inputs as float/double and output as uint8_t/float/double. 
+ static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))), + "OutType can be uint8_t, float, double," + "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType)."); + typedef typename std::conditional::type CosOutType; CosOutType* pDcast = reinterpret_cast(pD); ASSERT( @@ -234,12 +256,12 @@ void cosineAlgo1(Index_ m, if (isRowMajor) { lda = k, ldb = k, ldd = n; - cosine( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op, stream); + cosine( + m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, fin_op, stream); } else { lda = n, ldb = m, ldd = m; - cosine( - n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, wrapped_fin_op, stream); + cosine( + n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, fin_op, stream); } } diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index fa0c7a48cc..b459c73bee 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -615,6 +615,19 @@ void distance(const InType* x, * @note if workspace is passed as nullptr, this will return in * worksize, the number of bytes of workspace required */ + +// Default final op functor which facilitates elementwise operation on +// final distance value if any. +template +struct default_fin_op { + __host__ __device__ default_fin_op() noexcept {}; + // functor signature. + __host__ __device__ OutType operator()(AccType d_val, Index g_d_idx) const noexcept + { + return d_val; + } +}; + template ( - x, y, dist, m, n, k, workspace, worksize, default_fin_op, stream, isRowMajor, metric_arg); + using final_op_type = default_fin_op; + final_op_type fin_op; + + // raft distance support inputs as float/double and output as uint8_t/float/double. 
+ static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))), + "OutType can be uint8_t, float, double," + "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType)."); + distance( + x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -651,7 +670,7 @@ void distance(const InType* x, * @param n number of points in y * @param k dimensionality * - * @note If the specifed distanceType doesn't need the workspace at all, it + * @note If the specified distanceType doesn't need the workspace at all, it * returns 0. */ template +#include #include namespace raft { namespace distance { namespace detail { +template +struct L2ExpandedOp { + bool sqrt; + + __device__ L2ExpandedOp() noexcept : sqrt(false) {} + __device__ L2ExpandedOp(bool isSqrt) noexcept : sqrt(isSqrt) {} + __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept + { + AccT outVal = aNorm + bNorm - DataT(2.0) * accVal; + return sqrt ? 
raft::mySqrt(outVal) : outVal; + } + + __device__ AccT operator()(DataT aData) const noexcept { return aData; } +}; + /** * @brief the expanded euclidean distance matrix calculation implementer * It computes the following equation: C = op(A^2 + B^2 - 2AB) @@ -71,71 +88,85 @@ void euclideanExpImpl(const DataT* x, FinalLambda fin_op, cudaStream_t stream) { - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; +#if (__CUDACC_VER_MAJOR__ < 12) + const auto deviceVersion = getComputeCapability(); + if (deviceVersion.first >= 8) { + using L2Op = L2ExpandedOp; + L2Op L2_dist_op(sqrt); - typedef typename std::conditional::type KPolicy; + cutlassDistanceKernel( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, L2_dist_op, stream); - dim3 blk(KPolicy::Nthreads); + } else +#endif + { - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { + typedef typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = regxn[i] + 
regyn[j] - (DataT)2.0 * acc[i][j]; + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + acc[i][j] = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j]; + } } - } - if (sqrt) { + if (sqrt) { #pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::mySqrt(acc[i][j]); + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + acc[i][j] = raft::mySqrt(acc[i][j]); + } } } - } - }; + }; - constexpr size_t shmemSize = - KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); - if (isRowMajor) { - auto euclideanExpRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); - - euclideanExpRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto euclideanExpColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); - euclideanExpColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + constexpr size_t shmemSize = + KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); + if (isRowMajor) { + auto euclideanExpRowMajor = pairwiseDistanceMatKernelPriorToAmpere; + dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); + + euclideanExpRowMajor<<>>( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + } else { + auto euclideanExpColMajor = pairwiseDistanceMatKernelPriorToAmpere; + dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); + euclideanExpColMajor<<>>( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + } } RAFT_CUDA_TRY(cudaGetLastError()); @@ -164,6 +195,7 @@ void euclideanExp(IdxT m, { size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) 
== 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { euclideanExpImpl( x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); @@ -217,8 +249,11 @@ void euclideanAlgo1(Index_ m, { auto norm_op = [] __device__(InType in) { return in; }; - typedef std::is_same is_bool; - typedef typename std::conditional::type ExpOutType; + // raft distance support inputs as float/double and output as uint8_t/float/double. + static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))), + "OutType can be uint8_t, float, double," + "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType)."); + typedef typename std::conditional::type ExpOutType; ExpOutType* pDcast = reinterpret_cast(pD); ASSERT( diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index 1385d0aa09..e8c2648c2e 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -174,7 +174,8 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { - acc[i][j] = raft::mySqrt(acc[i][j]); + auto acc_ij = acc[i][j]; + acc[i][j] = acc_ij > DataT{0} ? 
raft::mySqrt(acc_ij) : DataT{0}; } } } diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index 54ac490ca4..344dda693e 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -120,7 +120,7 @@ class GramMatrixBase { * * @param [in] x1 device array of vectors, size [n1*n_cols] * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of colums (features) in x1 and x2 + * @param [in] n_cols number of columns (features) in x1 and x2 * @param [in] x2 device array of vectors, size [n2*n_cols] * @param [in] n2 number vectors in x2 * @param [out] out device buffer to store the Gram matrix, size [n1*n2] diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index 6d59e1c7c5..b74de84d80 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -47,7 +47,7 @@ __global__ void polynomial_kernel_nopad( * @param inout device vector in column major format, size [ld * cols] * @param ld leading dimension of the inout buffer * @param rows number of rows (rows <= ld) - * @param cols number of colums + * @param cols number of columns * @param exponent * @param gain * @param offset @@ -85,7 +85,7 @@ __global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t * @param inout device vector in column major format, size [ld * cols] * @param ld leading dimension of the inout buffer * @param rows number of rows (rows <= ld) - * @param cols number of colums + * @param cols number of columns * @param gain * @param offset */ diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index 27e9935358..26536d13cd 100644 --- 
a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -364,6 +364,91 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) obj.run(); } +/** + * @brief the distance matrix calculation kernel for L2 and cosine + * for GPU arch < SM 8.0, this version is to make sure we don't recompile + * these kernels for ampere or higher as we use cutlass kernel for it. + * @tparam useNorms whether norms are needed + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Policy struct which tunes the Contraction kernel + * @tparam CoreLambda lambda which implements accumulation operation + * @tparam EpilogueLambda lambda which implements operation for calculating + final value. + * @tparam FinalLambda final lambda called on final distance value + * @tparam isRowMajor true if input/output is row major(default), + false for column major + * + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] xn row norms of input matrix A. + * @param[in] yn row norms of input matrix B. 
+ * @param[in] m number of rows of A and C/D + * @param[in] n number of columns of B and C/D + * @param[in] k number of cols of A and rows of B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[out] dOutput output matrix + * @param core_op the core lambda + * @param epilog_op the epilogue lambda + * @param fin_op the final gemm epilogue lambda + */ + +template +__global__ __launch_bounds__(Policy::Nthreads, 2) + + void pairwiseDistanceMatKernelPriorToAmpere(const DataT* x, + const DataT* y, + const DataT* _xn, + const DataT* _yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + CoreLambda core_op, + EpilogueLambda epilog_op, + FinalLambda fin_op) +{ + //#if __CUDA_ARCH__ < 800 + // TODO: re-enable the CUDA_ARCH guard for below Ampere once cutlass based + // kernels are enabled for CUDA 12.0 + extern __shared__ char smem[]; + auto rowEpilog = [] __device__(IdxT starty) { return; }; + + PairwiseDistances + obj( + x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog); + obj.run(); + //#endif +} + template dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func) { diff --git a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh new file mode 100644 index 0000000000..f39d880da4 --- /dev/null +++ b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" + +#if (__CUDACC_VER_MAJOR__ < 12) + +// We define CUTLASS_NAMESPACE in case +// RAFT cmake is not used +#ifndef CUTLASS_NAMESPACE +#define cutlass raft_cutlass +#endif + +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "./pairwise_distance_epilogue_elementwise.h" +#include "./pairwise_distance_gemm.h" + +#define CUTLASS_CHECK(status) \ + { \ + cutlass::Status error = status; \ + if (error != cutlass::Status::kSuccess) { \ + std::cerr << "Got cutlass error: " << cutlassGetStatusString(error) << " at: " << __LINE__ \ + << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + +namespace raft { +namespace distance { +namespace detail { + +template +void cutlassDistanceKernel(const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + DistanceFn dist_op, + cudaStream_t stream) +{ + static_assert(!(std::is_same::value), + "OutType bool is not supported use uint8_t instead"); + + using EpilogueOutputOp = + cutlass::epilogue::thread::PairwiseDistanceEpilogueElementwise; + constexpr int batch_count = 1; + + constexpr auto mode = cutlass::gemm::GemmUniversalMode::kGemm; + + typename EpilogueOutputOp::Params epilog_op_param(dist_op, fin_op); + + const DataT *a, *b; + + IdxT gemm_lda, gemm_ldb; + + // Number of pipelines you want to use + constexpr int NumStages = 3; + // Alignment + constexpr int 
Alignment = VecLen; + + // default initialize problem size with row major inputs + auto problem_size = cutlass::gemm::GemmCoord(n, m, k); + + using cutlassDistKernel = + typename cutlass::gemm::kernel::PairwiseDistanceGemm::GemmKernel; + + using cutlassDist = cutlass::gemm::device::GemmUniversalAdapter; + + if constexpr (isRowMajor) { + a = y; + b = x; + gemm_lda = ldb; + gemm_ldb = lda; + } else { + problem_size = cutlass::gemm::GemmCoord(m, n, k); + a = x; + b = y; + gemm_lda = lda; + gemm_ldb = ldb; + } + + typename cutlassDist::Arguments arguments{ + mode, problem_size, batch_count, epilog_op_param, a, b, + xn, // C matrix eq vector param, which here is A norm + nullptr, // tensor_Z, + (DataT*)yn, // this is broadcast vec, which is required to be non-const param + dOutput, // Output distance matrix + (int64_t)0, // batch stride A + (int64_t)0, // batch stride B + (int64_t)0, // batch stride Norm A + (int64_t)0, + (int64_t)0, // batch stride Norm B + (int64_t)0, // batch stride Output + gemm_lda, // stride A + gemm_ldb, // stride B + 1, // stride A norm + 0, // this is no-op for Z + 0, // This must be zero + ldd // stride Output matrix + }; + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = cutlassDist::get_workspace_size(arguments); + // Allocate workspace memory + rmm::device_uvector workspace(workspace_size, stream); + // Instantiate CUTLASS kernel depending on templates + cutlassDist cutlassDist_op; + // Check the problem size is supported or not + cutlass::Status status = cutlassDist_op.can_implement(arguments); + CUTLASS_CHECK(status); + // Initialize CUTLASS kernel with arguments and workspace pointer + status = cutlassDist_op.initialize(arguments, workspace.data(), stream); + CUTLASS_CHECK(status); + // Launch initialized CUTLASS kernel + status = cutlassDist_op(); + CUTLASS_CHECK(status); +} + +}; // namespace detail +}; // namespace distance +}; // namespace raft +#endif // 
(__CUDACC_VER_MAJOR__ < 12) +#pragma GCC diagnostic pop diff --git a/cpp/include/raft/distance/detail/pairwise_distance_epilogue.h b/cpp/include/raft/distance/detail/pairwise_distance_epilogue.h new file mode 100644 index 0000000000..21e7d18854 --- /dev/null +++ b/cpp/include/raft/distance/detail/pairwise_distance_epilogue.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. + +This is adapted from DefaultEpilogueWithBroadcastTensorOp from CUTLASS 2.9.0 +(https://github.com/NVIDIA/cutlass/blob/master/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h#L75) + +This epilogue allows us to load norm buffers using PredicatedTileIteratorNormVec +and EpilogueWithBroadcast used for distances L2/cosine as well as applies user-define elementwise +operation. 
+-- A norm load is provided PredicatedTileIteratorNormVec +-- B norm load is provided by EpilogueWithBroadcast +-- elementwise operation is provided by OutputOp +*/ + +#pragma once + +#include +#include +#include + +#include + +#include "./predicated_tile_iterator_normvec.h" +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Defines sensible defaults for epilogues for TensorOps. +template +struct PairwiseDistanceEpilogue { + /// Use defaults related to the existing epilogue + using Base = + DefaultEpilogueTensorOp; + + // + // Stores the result z = (y = GEMM(A, B, C), broadcast) + // + using OutputTileIterator = cutlass::epilogue::threadblock:: + PredicatedTileIteratorNormVec; + + // + // Additional tensor tile iterator - stores t = Elementwise(z) + // + using TensorTileIterator = + cutlass::epilogue::threadblock::PredicatedTileIterator; + + /// Define the epilogue + using Epilogue = EpilogueWithBroadcast; +}; + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/include/raft/distance/detail/pairwise_distance_epilogue_elementwise.h b/cpp/include/raft/distance/detail/pairwise_distance_epilogue_elementwise.h new file mode 100644 index 0000000000..3e33f4d833 --- /dev/null +++ b/cpp/include/raft/distance/detail/pairwise_distance_epilogue_elementwise.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// +/*! \file + \brief Functor performing distance operations used by epilogues of pairwise distance + * kernels. +* This is adapted from LinearCombinationBiasElementwise from CUTLASS 2.9.0 +* customized for applying elementwise distance formula on accumulated GEMM value +* and applying user-defined final custom operation on the distance value. +*/ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// This base class is meant to define the concept required of the +/// EpilogueWithBroadcast::OutputOp +template +class PairwiseDistanceEpilogueElementwise { + public: + using ElementOutput = ElementC_; + using ElementC = ElementC_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + using ElementZ = ElementZ_; + using ElementT = ElementT_; + static int const kElementsPerAccess = ElementsPerAccess; + static int const kCount = kElementsPerAccess; + + using DistanceOp = DistanceOp_; + using FinalOp = FinalOp_; + + using FragmentAccumulator = Array; + using FragmentCompute = Array; + using FragmentC = Array; + using FragmentZ = Array; + using FragmentT = Array; + + using FragmentOutput = FragmentZ; + + static bool const kIsHeavy = false; // ElementwiseOp::kIsHeavy; + + /// If true, the 'Z' tensor is stored + 
static bool const kStoreZ = false; // We don't store anything in Z, + + /// If true, the 'T' tensor is stored + static bool const kStoreT = true; // this is our final output storage. + + /// Host-constructable parameters structure + struct Params { + FinalOp_ final_op_; + DistanceOp_ dist_op_; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params(DistanceOp_ dist_op, FinalOp final_op) : final_op_(final_op), dist_op_(dist_op) {} + + CUTLASS_HOST_DEVICE + Params() {} + }; + + private: + // + // Data members + // + FinalOp_ final_op; + DistanceOp_ elementwise_op; + + public: + // + // Methods + // + + /// Constructor from Params + CUTLASS_HOST_DEVICE + PairwiseDistanceEpilogueElementwise(Params const& params) + : final_op(params.final_op_), elementwise_op(params.dist_op_) + { + } + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const + { + // we use for making sure C matrix path is used for A mat norm. + return true; + } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition, int k_partition_count) {} + + /// Applies the operation when is_source_needed() is true + CUTLASS_HOST_DEVICE + void operator()(FragmentZ& frag_Z, + FragmentT& frag_T, + FragmentAccumulator const& AB, + FragmentC const& frag_C, + FragmentCompute const& V) const + { + FragmentCompute tmp_Accum = + NumericArrayConverter()(AB); + FragmentCompute tmp_C = + NumericArrayConverter()(frag_C); + FragmentCompute result_Z; + FragmentCompute result_T; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kElementsPerAccess; ++i) { + result_Z[i] = elementwise_op(tmp_C[i], V[i], tmp_Accum[i]); + result_T[i] = final_op(result_Z[i], 0); + } + + NumericArrayConverter convert_t; + frag_T = convert_t(result_T); + } + + /// Applies the operation when is_source_needed() is false + CUTLASS_HOST_DEVICE + void operator()(FragmentZ& frag_Z, + FragmentT& frag_T, + FragmentAccumulator const& AB, + FragmentCompute 
const& V) const + { + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace thread +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/include/raft/distance/detail/pairwise_distance_gemm.h b/cpp/include/raft/distance/detail/pairwise_distance_gemm.h new file mode 100644 index 0000000000..ea9ed77fb5 --- /dev/null +++ b/cpp/include/raft/distance/detail/pairwise_distance_gemm.h @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include +#include +#include + +#include "./pairwise_distance_epilogue.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC_, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Element type for final output + // typename ElementOutT, + /// Epilogue output operator - must satisfy concept of 'EpilogueWithBroadcastOp' + typename EpilogueOutputOp, + /// Number of stages used in the pipelined mainloop + int Stages, + /// data layout row/column major of inputs + bool isRowMajor> +struct PairwiseDistanceGemm { + // This struct is specialized for fp32/3xTF32 + + /// Threadblock-level tile size (concept: GemmShape) + using ThreadblockShape = + cutlass::gemm::GemmShape<128, 128, 16>; // <- threadblock tile M = 128, N = 128, K = 16 + /// Warp-level tile size (concept: GemmShape) + // This code section describes tile size a warp will compute + using WarpShape = cutlass::gemm::GemmShape<64, 64, 16>; // <- warp tile M = 64, N = 64, K = 16 + /// Warp-level tile size (concept: GemmShape) + // This code section describes the size of MMA op + using InstructionShape = + cutlass::gemm::GemmShape<16, 8, 8>; // <- MMA Op tile M = 16, N = 8, K = 8 + + /// Operation performed by GEMM + using Operator = cutlass::arch::OpMultiplyAddFastF32; + + // This code section describes whether you want to use tensor cores or regular SIMT cores on GPU + // SM + using OperatorClass = 
cutlass::arch::OpClassTensorOp; + + // This code section describes CUDA SM architecture number + using ArchTag = cutlass::arch::Sm80; + + // This code section describes how threadblocks are scheduled on GPU + /// Threadblock-level swizzling operator + using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + + /// data layout for final output matrix. + // we keep this same layout even for column major inputs + using LayoutOutput = cutlass::layout::RowMajor; + + typedef typename std::conditional::type NormXLayout; + + typedef typename std:: + conditional::type LayoutA_; + + typedef typename std:: + conditional::type LayoutB_; + + using GemmBase = typename DefaultGemmUniversal::GemmKernel; + + // Replace epilogue + using Epilogue = typename cutlass::epilogue::threadblock::PairwiseDistanceEpilogue< + typename GemmBase::Epilogue::Shape, + typename GemmBase::Epilogue::WarpMmaOperator, + GemmBase::Epilogue::kPartitionsK, + ElementAccumulator, + typename EpilogueOutputOp::ElementT, + ElementAccumulator, + EpilogueOutputOp, + NormXLayout, + GemmBase::Epilogue::kElementsPerAccess>::Epilogue; + + // Compose the GEMM kernel + using GemmKernel = GemmWithFusedEpilogue; +}; + +template < + /// Layout type for A matrix operand + int kAlignmentA, + /// Layout type for B matrix operand + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC_, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Epilogue output operator - must satisfy concept of 'EpilogueWithBroadcastOp' + typename EpilogueOutputOp, + /// Number of stages used in the pipelined mainloop + int Stages, + /// data layout row/column major of inputs + bool isRowMajor> +struct PairwiseDistanceGemm { + // using Transform = cutlass::ComplexTransform::kNone; + // Threadblock-level tile size (concept: GemmShape) + using ThreadblockShape = + cutlass::gemm::GemmShape<64, 64, 16>; // <- threadblock tile M = 64, N = 64, K = 16 + /// 
Warp-level tile size (concept: GemmShape) + // This code section describes tile size a warp will compute + using WarpShape = cutlass::gemm::GemmShape<32, 32, 16>; // <- warp tile M = 32, N = 32, K = 16 + /// Warp-level tile size (concept: GemmShape) + // This code section describes the size of MMA op + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + // Operation performed by GEMM + using Operator = cutlass::arch::OpMultiplyAdd; + // This code section describes whether you want to use tensor cores or regular SIMT cores on GPU + // SM + using OperatorClass = cutlass::arch::OpClassTensorOp; + + // This code section describes CUDA SM architecture number + using ArchTag = cutlass::arch::Sm80; + + // This code section describes how threadblocks are scheduled on GPU + /// Threadblock-level swizzling operator + using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + + /// data layout for final output matrix. + // we keep this same layout even for column major inputs + using LayoutOutput = cutlass::layout::RowMajor; + + typedef typename std::conditional::type NormXLayout; + + typedef typename std:: + conditional::type LayoutA_; + + typedef typename std:: + conditional::type LayoutB_; + + using GemmBase = typename DefaultGemmUniversal::GemmKernel; + + // Replace epilogue + using Epilogue = typename cutlass::epilogue::threadblock::PairwiseDistanceEpilogue< + typename GemmBase::Epilogue::Shape, + typename GemmBase::Epilogue::WarpMmaOperator, + GemmBase::Epilogue::kPartitionsK, + ElementC_, + typename EpilogueOutputOp::ElementT, + ElementC_, + EpilogueOutputOp, + NormXLayout, + GemmBase::Epilogue::kElementsPerAccess>::Epilogue; + + // Compose the GEMM kernel + using GemmKernel = GemmWithFusedEpilogue; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass \ No newline at end of file diff --git 
a/cpp/include/raft/distance/detail/predicated_tile_iterator_normvec.h b/cpp/include/raft/distance/detail/predicated_tile_iterator_normvec.h new file mode 100644 index 0000000000..67c01448dc --- /dev/null +++ b/cpp/include/raft/distance/detail/predicated_tile_iterator_normvec.h @@ -0,0 +1,581 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. + +This file contains a customized version of PredicatedTileIterator from CUTLASS 2.9.0 +(https://github.com/NVIDIA/cutlass/blob/v2.9.0/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h#L75) + +Changes: +- added `Layout_` template param +- Only the row index is used to load the data in load_with_byte_offset(). + This way the same normalization data is used across all columns in a row. + +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////// + +namespace epilogue { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator used to load and store output tile from global memory in epilogue. 
+/// +/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator +/// +template +class PredicatedTileIteratorNormVec { + public: + using ThreadMap = ThreadMap_; + using Shape = typename ThreadMap::Shape; + + using Element = Element_; + + using Layout = Layout_; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = MatrixCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + static int const kThreads = ThreadMap::kThreads; + static int const kIterations = ThreadMap::Count::kTile; + + static_assert(ThreadMap::Iterations::kRow > 0, "ThreadMap::Iterations::kRow must be > 0"); + static_assert(ThreadMap::Iterations::kGroup > 0, "ThreadMap::Iterations::kGroup must be > 0"); + static_assert(ThreadMap::Iterations::kCluster > 0, "ThreadMap::Iterations::kCluster must be > 0"); + static_assert(ThreadMap::Iterations::kColumn > 0, "ThreadMap::Iterations::kColumn must be > 0"); + + /// Fragment object + using Fragment = Array; + + /// Memory access size + using AccessType = AlignedArray; + + // + // Parameters struct + // + + /// Uses a non-template class + struct Params : PredicatedTileIteratorParams { + using Base = PredicatedTileIteratorParams; + + CUTLASS_HOST_DEVICE + Params() {} + + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : PredicatedTileIteratorParams( + layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess, + make_OutputTileThreadMapDesc()) + { + } + + CUTLASS_HOST_DEVICE + Params(Base const& base) : Base(base) {} + }; + + /// Mask object + struct Mask { + static int const kCount = ThreadMap::Iterations::kColumn; + + /// Predicate state + bool predicates[kCount]; + + // + // Mask + // + CUTLASS_HOST_DEVICE + Mask() { enable(); } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_HOST_DEVICE void clear() + { + CUTLASS_PRAGMA_UNROLL + 
for (int i = 0; i < kCount; ++i) { + predicates[i] = false; + } + } + + ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask + CUTLASS_DEVICE void enable() + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + predicates[i] = true; + } + } + }; + + private: + // + // Data members + // + + /// Parameters structure containing reference and precomputed state. + PredicatedTileIteratorParams params_; + + /// Byte-level pointer + uint8_t* byte_pointer_; + + /// Array of boolean values to contain steady-state predicates + Mask mask_; + + /// Extent of the matrix tile in rows + Index extent_row_; + + /// Extent of the matrix tile in rows + Index extent_column_; + + /// A thread's starting row position (assuming steady-state predicates have been computed) + Index thread_start_row_; + + /// A thread's starting column + Index thread_start_column_; + + /// Internal state counter + int state_[3]; + + /// Scatter indices + int const* indices_; + + // + // Static asserts about internal strides + // + + static_assert(sizeof(extent_row_) == 4, "Expected 32b extents"); + static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents"); + static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides"); + + private: + // + // Methods + // + + public: + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + PredicatedTileIteratorNormVec(PredicatedTileIteratorParams const& params, + Element* pointer, + TensorCoord extent, + int thread_idx, + TensorCoord threadblock_offset = TensorCoord(), + int const* indices = nullptr) + : params_(params), indices_(indices) + { + TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset; + + extent_row_ = extent.row(); + extent_column_ = extent.column(); + + thread_start_row_ = thread_offset.row(); + thread_start_column_ = thread_offset.column(); + + // Initialize predicates + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) { + 
mask_.predicates[c] = + ((thread_offset.column() + ThreadMap::Delta::kColumn * c) < extent.column()); + } + + // Null pointer performs no accesses + if (!pointer) { mask_.clear(); } + + if (ScatterD && !indices) { mask_.clear(); } + + // Initialize pointer + byte_pointer_ = reinterpret_cast(pointer) + + LongIndex(thread_offset.row()) * LongIndex(params_.stride); + + if (ScatterD) { + byte_pointer_ = reinterpret_cast(pointer) + + LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess; + } + + // Initialize internal state counter + state_[0] = state_[1] = state_[2] = 0; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + byte_pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, int64_t byte_offset) const + { + uint8_t* byte_pointer = byte_pointer_; + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + AccessType* memory_pointer = reinterpret_cast(byte_pointer + byte_offset); + + if (ScatterD && row_guard) { + assert(indices_); + + memory_pointer = reinterpret_cast( + byte_pointer + byte_offset + + LongIndex(indices_[row_offset + thread_start_row_]) * LongIndex(params_.stride)); + } + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; 
++column) { + bool guard = row_guard && mask_.predicates[column]; + + cutlass::arch::global_load( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer[0], + guard); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + if (!ScatterD) { byte_pointer += params_.increment_row; } + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { byte_pointer += params_.increment_group; } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) const { load_with_byte_offset(frag, 0); } + + /// Stores a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, int64_t byte_offset) const + { + uint8_t* byte_pointer = byte_pointer_; + AccessType const* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + AccessType* memory_pointer = reinterpret_cast(byte_pointer + byte_offset); + + if (ScatterD && row_guard) { + assert(indices_); + + memory_pointer = reinterpret_cast( + byte_pointer + byte_offset + + LongIndex(indices_[row_offset + thread_start_row_]) * LongIndex(params_.stride)); + } + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + bool guard = row_guard && mask_.predicates[column]; + + if (UseCUDAStore) { + if (guard) { + 
memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess] = + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column]; + } + } else { + cutlass::arch::global_store( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess], + guard); + } + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + if (!ScatterD) { byte_pointer += params_.increment_row; } + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { byte_pointer += params_.increment_group; } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + /// Stores a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) const { store_with_byte_offset(frag, 0); } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void downsample_load_with_byte_offset(Fragment& frag, + int64_t byte_offset, + int convolution_P, + int convolution_Q, + int add_P, + int add_Q, + int problem_N) const + { + uint8_t* byte_pointer = byte_pointer_; + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + int output_row = row_offset + thread_start_row_; + int output_N = output_row / (convolution_P * convolution_Q); + int output_PQ = output_row % (convolution_P * convolution_Q); + int output_P = output_PQ / convolution_Q; + int output_Q = output_PQ % 
convolution_Q; + + int input_row = output_N * 2 * convolution_P * 2 * convolution_Q + + (2 * output_P + add_P) * 2 * convolution_Q + 2 * output_Q + add_Q; + + int64_t byte_offset = (input_row - output_row) * problem_N * sizeof(float); + + AccessType* memory_pointer = reinterpret_cast(byte_pointer + byte_offset); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + bool guard = row_guard && mask_.predicates[column]; + + cutlass::arch::global_load( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess], + guard); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { byte_pointer += params_.increment_row; } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { byte_pointer += params_.increment_group; } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void upsample_load_with_byte_offset(Fragment& frag, + int64_t byte_offset, + int convolution_P, + int convolution_Q, + int add_P, + int add_Q, + int problem_N) const + { + uint8_t* byte_pointer = byte_pointer_; + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + int output_row = row_offset + thread_start_row_; + int output_N = output_row / (convolution_P * 
convolution_Q); + int output_PQ = output_row % (convolution_P * convolution_Q); + int output_P = output_PQ / convolution_Q; + int output_Q = output_PQ % convolution_Q; + int row_add_P = add_P; + int row_add_Q = add_Q; + if (output_P > convolution_P - 2) row_add_P = 0; + if (output_Q > convolution_Q - 2) row_add_Q = 0; + + int input_row = output_N * (convolution_P / 2) * (convolution_Q / 2) + + ((output_P + row_add_P) / 2) * (convolution_Q / 2) + + (output_Q + row_add_Q) / 2; + + int64_t byte_offset = (input_row - output_row) * problem_N * sizeof(float); + + AccessType* memory_pointer = reinterpret_cast(byte_pointer + byte_offset); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + bool guard = row_guard && mask_.predicates[column]; + + cutlass::arch::global_load( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess], + guard); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { byte_pointer += params_.increment_row; } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { byte_pointer += params_.increment_group; } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + CUTLASS_DEVICE + MatrixCoord thread_start() const { return MatrixCoord(thread_start_row_, thread_start_column_); } + + /// Need to get the thread start row from the tile iterator + CUTLASS_DEVICE + int32_t thread_start_row() const { return thread_start_row_; } + + /// Need to get the thread start row from the tile iterator + CUTLASS_DEVICE + int32_t thread_start_column() const { return thread_start_column_; } + + /// Extent of the matrix in rows + CUTLASS_DEVICE + Index extent_row() const { return extent_row_; } + + /// Extent of the matrix in columns + CUTLASS_DEVICE + Index extent_column() const { return extent_column_; } + + /// Advances to the next position to load or store + 
CUTLASS_HOST_DEVICE + PredicatedTileIteratorNormVec& operator++() + { + ++state_[0]; + + if (!ScatterD) { byte_pointer_ += params_.advance_row; } + + thread_start_row_ += ThreadMap::Shape::kRow; + + if (state_[0] == ThreadMap::Count::kRow) { + state_[0] = 0; + ++state_[1]; + byte_pointer_ += params_.advance_group; + + thread_start_row_ += + (ThreadMap::Shape::kGroup - 1) * ThreadMap::Shape::kRow * ThreadMap::Count::kRow; + + if (state_[1] == ThreadMap::Count::kGroup) { + state_[1] = 0; + ++state_[2]; + byte_pointer_ += params_.advance_cluster; + + thread_start_row_ += ThreadMap::Count::kGroup * ThreadMap::Shape::kGroup * + ThreadMap::Count::kRow * ThreadMap::Shape::kRow; + + if (state_[2] == ThreadMap::Count::kCluster) { + state_[2] = 0; + byte_pointer_ += params_.advance_tile; + } + } + } + + return *this; + } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_DEVICE void clear_mask() { mask_.clear(); } + + ///< Efficiently enables all accesses guarded by mask + CUTLASS_DEVICE void enable_mask() { mask_.enable(); } + + ///< Sets the mask + CUTLASS_DEVICE void get_mask(Mask& mask) const { mask = mask_; } + + ///< Sets the mask + CUTLASS_DEVICE void set_mask(Mask const& mask) { mask_ = mask; } +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh index fb4fb8d34c..ef51a54622 100644 --- a/cpp/include/raft/distance/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -107,7 +107,9 @@ void fusedL2NN(OutT* min, bool is_skinny = k < 32; size_t bytes = sizeof(DataT) * k; - if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { + auto px = reinterpret_cast(x); + auto py = reinterpret_cast(y); + if (16 % sizeof(DataT) == 0 && bytes % 16 == 0 && px % 16 == 0 
&& py % 16 == 0) { if (is_skinny) { detail::fusedL2NNImpl( min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); } - } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { + } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0 && px % 8 == 0 && py % 8 == 0) { if (is_skinny) { detail::fusedL2NNImpl= 0 then * the diagonal element is replaced by eps in case the diagonal is NaN or @@ -106,7 +106,7 @@ namespace linalg { * // Now U stores the Cholesky decomposition of A: A = U.T * U * @endcode * - * @param handle RAFT handle (used to retrive cuBLAS handles). + * @param handle RAFT handle (used to retrieve cuBLAS handles). * @param L device array for to store the triangular matrix L, and the new * column of A in column major format, size [n*n] * @param n number of elements in the new row. diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh index 6ef0d52e62..e9e5a99f46 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -112,21 +112,21 @@ void coalescedReduction(OutType* dots, template , + typename IdxType, + typename MainLambda = raft::Nop, typename ReduceLambda = raft::Sum, typename FinalLambda = raft::Nop> void coalesced_reduction(const raft::handle_t& handle, - raft::device_matrix_view data, - raft::device_vector_view dots, + raft::device_matrix_view data, + raft::device_vector_view dots, OutValueType init, bool inplace = false, - MainLambda main_op = raft::Nop(), + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), FinalLambda final_op = raft::Nop()) { if constexpr (std::is_same_v) { - RAFT_EXPECTS(static_cast(dots.size()) == data.extent(0), + RAFT_EXPECTS(static_cast(dots.size()) == data.extent(0), "Output should be equal to number of rows in Input"); coalescedReduction(dots.data_handle(), @@ -140,7 +140,7 @@ void coalesced_reduction(const raft::handle_t& handle, reduce_op, final_op); } 
else if constexpr (std::is_same_v) { - RAFT_EXPECTS(static_cast(dots.size()) == data.extent(1), + RAFT_EXPECTS(static_cast(dots.size()) == data.extent(1), "Output should be equal to number of columns in Input"); coalescedReduction(dots.data_handle(), diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh index a1d6ebbe6e..47937815bd 100644 --- a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh @@ -44,7 +44,7 @@ void choleskyRank1Update(const raft::handle_t& handle, // - A_21[j] = A_12.T[j] = A_new[j] j=0..n-2, vector with (n-1) elements // - A_22 = A_new[n-1] scalar. // - // Instead of caclulating the Cholelsky decomposition of A' from scratch, + // Instead of calculating the Cholelsky decomposition of A' from scratch, // we just update L with the new row. The new Cholesky decomposition will be // calculated as: // L' = [[L_11, 0] @@ -114,7 +114,7 @@ void choleskyRank1Update(const raft::handle_t& handle, handle.sync_stream(stream); L_22_host = std::sqrt(L_22_host - s_host); - // Check for numeric error with sqrt. If the matrix is not positive definit or + // Check for numeric error with sqrt. If the matrix is not positive definite or // the system is very ill conditioned then the A_22 - L_12 * L_12 can be // negative, which would result L_22 = NaN. A small positive eps parameter // can be used to prevent this. diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh index cf1b8cf5a5..63351f5475 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh @@ -17,34 +17,136 @@ #pragma once #include +#include #include +#include namespace raft { namespace linalg { namespace detail { -// Kernel (based on norm.cuh) to perform reductions along the coalesced dimension -// of the matrix, i.e. 
reduce along rows for row major or reduce along columns -// for column major layout. Kernel does an inplace reduction adding to original -// values of dots. +template +struct ReductionThinPolicy { + static constexpr int LogicalWarpSize = warpSize; + static constexpr int RowsPerBlock = rpb; + static constexpr int ThreadsPerBlock = LogicalWarpSize * RowsPerBlock; +}; + +template +__global__ void __launch_bounds__(Policy::ThreadsPerBlock) + coalescedReductionThinKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda final_op, + bool inplace = false) +{ + IdxType i = threadIdx.y + (Policy::RowsPerBlock * static_cast(blockIdx.x)); + if (i >= N) return; + + OutType acc = init; + for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) { + acc = reduce_op(acc, main_op(data[j + (D * i)], j)); + } + acc = raft::logicalWarpReduce(acc, reduce_op); + if (threadIdx.x == 0) { + if (inplace) { + dots[i] = final_op(reduce_op(dots[i], acc)); + } else { + dots[i] = final_op(acc); + } + } +} + +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void coalescedReductionThin(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + common::nvtx::range fun_scope( + "coalescedReductionThin<%d,%d>", Policy::LogicalWarpSize, Policy::RowsPerBlock); + dim3 threads(Policy::LogicalWarpSize, Policy::RowsPerBlock, 1); + dim3 blocks(ceildiv(N, Policy::RowsPerBlock), 1, 1); + coalescedReductionThinKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void coalescedReductionThinDispatcher(OutType* dots, + const InType* data, + 
IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + if (D <= IdxType(2)) { + coalescedReductionThin>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (D <= IdxType(4)) { + coalescedReductionThin>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (D <= IdxType(8)) { + coalescedReductionThin>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (D <= IdxType(16)) { + coalescedReductionThin>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else { + coalescedReductionThin>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } +} + +template -__global__ void coalescedReductionKernel(OutType* dots, - const InType* data, - int D, - int N, - OutType init, - MainLambda main_op, - ReduceLambda reduce_op, - FinalLambda final_op, - bool inplace = false) +__global__ void __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda final_op, + bool inplace = false) { - typedef cub::BlockReduce BlockReduce; + typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType thread_data = init; IdxType rowStart = blockIdx.x * D; @@ -62,6 +164,169 @@ __global__ void coalescedReductionKernel(OutType* dots, } } +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void coalescedReductionMedium(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + common::nvtx::range 
fun_scope("coalescedReductionMedium<%d>", TPB); + coalescedReductionMediumKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void coalescedReductionMediumDispatcher(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + // Note: for now, this kernel is only used when D > 256. If this changes in the future, use + // smaller block sizes when relevant. + coalescedReductionMedium<256>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); +} + +template +struct ReductionThickPolicy { + static constexpr int ThreadsPerBlock = tpb; + static constexpr int BlocksPerRow = bpr; + static constexpr int BlockStride = tpb * bpr; +}; + +template +__global__ void __launch_bounds__(Policy::ThreadsPerBlock) + coalescedReductionThickKernel(OutType* buffer, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + OutType thread_data = init; + IdxType rowStart = blockIdx.x * D; + for (IdxType i = blockIdx.y * Policy::ThreadsPerBlock + threadIdx.x; i < D; + i += Policy::BlockStride) { + IdxType idx = rowStart + i; + thread_data = reduce_op(thread_data, main_op(data[idx], i)); + } + OutType acc = BlockReduce(temp_storage).Reduce(thread_data, reduce_op); + if (threadIdx.x == 0) { buffer[Policy::BlocksPerRow * blockIdx.x + blockIdx.y] = acc; } +} + +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void coalescedReductionThick(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool 
inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + common::nvtx::range fun_scope( + "coalescedReductionThick<%d,%d>", ThickPolicy::ThreadsPerBlock, ThickPolicy::BlocksPerRow); + + dim3 threads(ThickPolicy::ThreadsPerBlock, 1, 1); + dim3 blocks(N, ThickPolicy::BlocksPerRow, 1); + + rmm::device_uvector buffer(N * ThickPolicy::BlocksPerRow, stream); + + /* We apply a two-step reduction: + * 1. coalescedReductionThickKernel reduces the [N x D] input data to [N x BlocksPerRow]. It + * applies the main_op but not the final op. + * 2. coalescedReductionThinKernel reduces [N x BlocksPerRow] to [N x 1]. It doesn't apply any + * main_op but applies final_op. If in-place, the existing and new values are reduced. + */ + + coalescedReductionThickKernel + <<>>(buffer.data(), data, D, N, init, main_op, reduce_op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + coalescedReductionThin(dots, + buffer.data(), + static_cast(ThickPolicy::BlocksPerRow), + N, + init, + stream, + inplace, + raft::Nop(), + reduce_op, + final_op); +} + +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void coalescedReductionThickDispatcher(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + // Note: multiple elements per thread to take advantage of the sequential reduction and loop + // unrolling + if (D < IdxType(32768)) { + coalescedReductionThick, ReductionThinPolicy<32, 4>>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else { + coalescedReductionThick, ReductionThinPolicy<32, 4>>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } +} + +// Primitive to perform reductions along the coalesced dimension of the matrix, i.e. 
reduce along +// rows for row major or reduce along columns for column major layout. Can do an inplace reduction +// adding to original values of dots if requested. template > void coalescedReduction(OutType* dots, const InType* data, - int D, - int N, + IdxType D, + IdxType N, OutType init, cudaStream_t stream, bool inplace = false, @@ -79,22 +344,22 @@ void coalescedReduction(OutType* dots, ReduceLambda reduce_op = raft::Sum(), FinalLambda final_op = raft::Nop()) { - // One block per reduction - // Efficient only for large leading dimensions - if (D <= 32) { - coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); - } else if (D <= 64) { - coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); - } else if (D <= 128) { - coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + /* The primitive selects one of three implementations based on heuristics: + * - Thin: very efficient when D is small and/or N is large + * - Thick: used when N is very small and D very large + * - Medium: used when N is too small to fill the GPU with the thin kernel + */ + const IdxType numSMs = raft::getMultiProcessorCount(); + if (D <= IdxType(256) || N >= IdxType(4) * numSMs) { + coalescedReductionThinDispatcher( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (N < numSMs && D >= IdxType(16384)) { + coalescedReductionThickDispatcher( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else { - coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + coalescedReductionMediumDispatcher( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } - RAFT_CUDA_TRY(cudaPeekAtLastError()); } } // namespace detail diff --git a/cpp/include/raft/linalg/detail/norm.cuh b/cpp/include/raft/linalg/detail/norm.cuh index a0b557211c..f2f08233d5 100644 --- 
a/cpp/include/raft/linalg/detail/norm.cuh +++ b/cpp/include/raft/linalg/detail/norm.cuh @@ -16,15 +16,13 @@ #pragma once +#include #include namespace raft { namespace linalg { namespace detail { -/** different types of norms supported on the input buffers */ -enum NormType { L1Norm = 0, L2Norm }; - template void rowNormCaller(Type* dots, const Type* data, @@ -64,7 +62,21 @@ void rowNormCaller(Type* dots, raft::Sum(), fin_op); break; - default: ASSERT(false, "Invalid norm type passed! [%d]", type); + case LinfNorm: + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L1Op(), + raft::Max(), + fin_op); + break; + default: THROW("Unsupported norm type: %d", type); }; } @@ -89,7 +101,7 @@ void colNormCaller(Type* dots, false, stream, false, - raft::L1Op(), + raft::L1Op(), raft::Sum(), fin_op); break; @@ -103,11 +115,25 @@ void colNormCaller(Type* dots, false, stream, false, - raft::L2Op(), + raft::L2Op(), raft::Sum(), fin_op); break; - default: ASSERT(false, "Invalid norm type passed! [%d]", type); + case LinfNorm: + raft::linalg::reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L1Op(), + raft::Max(), + fin_op); + break; + default: THROW("Unsupported norm type: %d", type); }; } diff --git a/cpp/include/raft/linalg/detail/normalize.cuh b/cpp/include/raft/linalg/detail/normalize.cuh new file mode 100644 index 0000000000..78c773ab35 --- /dev/null +++ b/cpp/include/raft/linalg/detail/normalize.cuh @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +struct NormalizeThinPolicy { + static constexpr int LogicalWarpSize = warpSize; + static constexpr int RowsPerBlock = rpb; + static constexpr int ThreadsPerBlock = LogicalWarpSize * RowsPerBlock; +}; + +template +__global__ void __launch_bounds__(Policy::ThreadsPerBlock) + coalesced_normalize_thin_kernel(Type* out, + const Type* in, + IdxType D, + IdxType N, + Type init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda fin_op, + Type eps) +{ + IdxType i = threadIdx.y + (Policy::RowsPerBlock * static_cast(blockIdx.x)); + if (i >= N) return; + + Type acc = init; + for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) { + Type val = in[j + D * i]; + acc = reduce_op(acc, main_op(val, j)); + } + acc = raft::logicalWarpReduce(acc, reduce_op); + acc = fin_op(acc); + if (acc <= eps) return; + for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) { + out[j + D * i] = in[j + D * i] / acc; + } +} + +template +inline void coalesced_normalize_thin(Type* out, + const Type* in, + IdxType D, + IdxType N, + Type init, + cudaStream_t stream, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda fin_op, + Type eps) +{ + dim3 grid(ceildiv(N, (IdxType)Policy::RowsPerBlock), 1, 1); + dim3 block(Policy::LogicalWarpSize, Policy::RowsPerBlock, 1); + coalesced_normalize_thin_kernel + <<>>(out, in, D, N, init, main_op, reduce_op, fin_op, eps); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +__global__ void 
__launch_bounds__(TPB) coalesced_normalize_medium_kernel(Type* out, + const Type* in, + IdxType D, + IdxType N, + Type init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda fin_op, + Type eps) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ Type bcast_acc; + Type thread_data = init; + IdxType rowStart = blockIdx.x * D; + for (IdxType i = threadIdx.x; i < D; i += TPB) { + IdxType idx = rowStart + i; + thread_data = reduce_op(thread_data, main_op(in[idx], i)); + } + Type acc = BlockReduce(temp_storage).Reduce(thread_data, reduce_op); + if (threadIdx.x == 0) { bcast_acc = fin_op(acc); } + __syncthreads(); + if (bcast_acc <= eps) return; + for (IdxType i = threadIdx.x; i < D; i += TPB) { + IdxType idx = rowStart + i; + out[idx] = in[idx] / bcast_acc; + } +} + +template +inline void coalesced_normalize_medium(Type* out, + const Type* in, + IdxType D, + IdxType N, + Type init, + cudaStream_t stream, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda fin_op, + Type eps) +{ + coalesced_normalize_medium_kernel + <<>>(out, in, D, N, init, main_op, reduce_op, fin_op, eps); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void coalesced_normalize(Type* out, + const Type* in, + IdxType D, + IdxType N, + Type init, + cudaStream_t stream, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda fin_op, + Type eps) +{ + const IdxType numSMs = raft::getMultiProcessorCount(); + if (D <= IdxType(256) || (D <= IdxType(512) && N >= 4 * numSMs)) { + if (D <= IdxType(2)) { + coalesced_normalize_thin>( + out, in, D, N, init, stream, main_op, reduce_op, fin_op, eps); + } else if (D <= IdxType(4)) { + coalesced_normalize_thin>( + out, in, D, N, init, stream, main_op, reduce_op, fin_op, eps); + } else if (D <= IdxType(8)) { + coalesced_normalize_thin>( + out, in, D, N, init, stream, main_op, reduce_op, fin_op, eps); + } else if (D <= IdxType(16)) { + coalesced_normalize_thin>( + out, in, 
D, N, init, stream, main_op, reduce_op, fin_op, eps); + } else { + coalesced_normalize_thin>( + out, in, D, N, init, stream, main_op, reduce_op, fin_op, eps); + } + } else { + coalesced_normalize_medium<256>(out, in, D, N, init, stream, main_op, reduce_op, fin_op, eps); + } +} + +} // namespace detail +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh index b956fa900e..450fb415e2 100644 --- a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh +++ b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh @@ -57,7 +57,7 @@ __global__ void reduce_cols_by_key_kernel( * @param out the output reduced matrix along columns (dim = nrows x nkeys). * This will be assumed to be in row-major layout * @param nrows number of rows in the input data - * @param ncols number of colums in the input data + * @param ncols number of columns in the input data * @param nkeys number of unique keys in the keys array * @param stream cuda stream to launch the kernel onto */ diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh index dc92271141..572d6b738c 100644 --- a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh @@ -182,7 +182,7 @@ void sum_rows_by_key_small_nkeys(const DataIteratorT d_A, // // Reduce by keys - large number of keys -// Computing a "weigthed histogram" with local histograms in smem +// Computing a "weighted histogram" with local histograms in smem // Keeping it simple - not optimized // diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index d68be838b0..8b5163a714 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -31,7 +31,7 @@ namespace linalg { * @brief Operations for all the columns or rows with a given 
vector. * Caution : Threads process multiple elements to speed up processing. These * are loaded in a single read thanks to type promotion. Faster processing - * would thus only be enabled when adresses are optimally aligned for it. + * would thus only be enabled when addresses are optimally aligned for it. * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing @@ -68,7 +68,7 @@ void matrixVectorOp(MatT* out, * @brief Operations for all the columns or rows with the given vectors. * Caution : Threads process multiple elements to speed up processing. These * are loaded in a single read thanks to type promotion. Faster processing - * would thus only be enabled when adresses are optimally aligned for it. + * would thus only be enabled when addresses are optimally aligned for it. * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing @@ -113,7 +113,7 @@ void matrixVectorOp(MatT* out, * @brief Operations for all the columns or rows with a given vector. * Caution : Threads process multiple elements to speed up processing. These * are loaded in a single read thanks to type promotion. Faster processing - * would thus only be enabled when adresses are optimally aligned for it. + * would thus only be enabled when addresses are optimally aligned for it. * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing @@ -172,7 +172,7 @@ void matrix_vector_op(const raft::handle_t& handle, * @brief Operations for all the columns or rows with the given vectors. * Caution : Threads process multiple elements to speed up processing. These * are loaded in a single read thanks to type promotion. 
Faster processing - * would thus only be enabled when adresses are optimally aligned for it. + * would thus only be enabled when addresses are optimally aligned for it. * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh index 389affef13..9abfd3bdb0 100644 --- a/cpp/include/raft/linalg/norm.cuh +++ b/cpp/include/raft/linalg/norm.cuh @@ -22,29 +22,24 @@ #include "linalg_types.hpp" #include +#include #include namespace raft { namespace linalg { -/** different types of norms supported on the input buffers */ -using detail::L1Norm; -using detail::L2Norm; -using detail::NormType; - /** * @brief Compute row-wise norm of the input matrix and perform fin_op lambda * * Row-wise norm is useful while computing pairwise distance matrix, for * example. - * This is used in many clustering algos like knn, kmeans, dbscan, etc... The - * current implementation is optimized only for bigger values of 'D'. + * This is used in many clustering algos like knn, kmeans, dbscan, etc... 
* * @tparam Type the data type * @tparam Lambda device final lambda * @tparam IdxType Integer type used to for addressing * @param dots the output vector of row-wise dot products - * @param data the input matrix (currently assumed to be row-major) + * @param data the input matrix * @param D number of columns of data * @param N number of rows of data * @param type the type of norm to be applied @@ -71,7 +66,7 @@ void rowNorm(Type* dots, * @tparam Lambda device final lambda * @tparam IdxType Integer type used to for addressing * @param dots the output vector of column-wise dot products - * @param data the input matrix (currently assumed to be row-major) + * @param data the input matrix * @param D number of columns of data * @param N number of rows of data * @param type the type of norm to be applied diff --git a/cpp/include/raft/linalg/norm_types.hpp b/cpp/include/raft/linalg/norm_types.hpp new file mode 100644 index 0000000000..d399e588ce --- /dev/null +++ b/cpp/include/raft/linalg/norm_types.hpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace raft { +namespace linalg { + +/** Enum to tell how to compute a norm */ +enum NormType : unsigned short { + /** L0 (actually not a norm): sum((x_i != 0 ? 1 : 0)) */ + L0PseudoNorm = 0, + /** L1 norm or Manhattan: sum(abs(x_i)) */ + L1Norm = 1, + /** L2 norm or Euclidean: sqrt(sum(x_i^2)). 
Note that in some prims the square root is optional, + in which case it can be specified using a boolean or a functor final_op */ + L2Norm = 2, + /** Linf norm or Chebyshev: max(abs(x_i)) */ + LinfNorm +}; + +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/normalize.cuh b/cpp/include/raft/linalg/normalize.cuh new file mode 100644 index 0000000000..4bdf697581 --- /dev/null +++ b/cpp/include/raft/linalg/normalize.cuh @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "detail/normalize.cuh" + +#include + +namespace raft { +namespace linalg { + +/** + * @brief Divide rows by their norm defined by main_op, reduce_op and fin_op + * + * @tparam ElementType Input/Output data type + * @tparam IndexType Integer type used to for addressing + * @tparam MainLambda Type of main_op + * @tparam ReduceLambda Type of reduce_op + * @tparam FinalLambda Type of fin_op + * @param[in] handle raft::handle_t + * @param[in] in the input raft::device_matrix_view + * @param[out] out the output raft::device_matrix_view + * @param[in] init Initialization value, i.e identity element for the reduction operation + * @param[in] main_op Operation to apply to the elements before reducing them (e.g square for L2) + * @param[in] reduce_op Operation to reduce a pair of elements (e.g sum for L2) + * @param[in] fin_op Operation to apply once to the reduction result to finalize the norm + * computation (e.g sqrt for L2) + * @param[in] eps If the norm is below eps, the row is considered zero and no division is applied + */ +template +void row_normalize(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out, + ElementType init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda fin_op, + ElementType eps = ElementType(1e-8)) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(in.extent(0) == out.extent(0), + "The number of rows of the input and output should be equal"); + RAFT_EXPECTS(in.extent(1) == out.extent(1), + "The number of columns of the input and output should be equal"); + + detail::coalesced_normalize(out.data_handle(), + in.data_handle(), + in.extent(1), + in.extent(0), + init, + handle.get_stream(), + main_op, + reduce_op, + fin_op, + eps); +} + +/** + * @brief Divide rows by their norm. 
+ * + * @tparam ElementType Input/Output data type + * @tparam IndexType Integer type used to for addressing + * @param[in] handle raft::handle_t + * @param[in] in the input raft::device_matrix_view + * @param[out] out the output raft::device_matrix_view + * @param[in] norm_type the type of norm to be applied + * @param[in] eps If the norm is below eps, the row is considered zero and no division is applied + */ +template +void row_normalize(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out, + NormType norm_type, + ElementType eps = ElementType(1e-8)) +{ + switch (norm_type) { + case L1Norm: + row_normalize(handle, + in, + out, + ElementType(0), + raft::L1Op(), + raft::Sum(), + raft::Nop(), + eps); + break; + case L2Norm: + row_normalize(handle, + in, + out, + ElementType(0), + raft::L2Op(), + raft::Sum(), + raft::SqrtOp(), + eps); + break; + case LinfNorm: + row_normalize(handle, + in, + out, + ElementType(0), + raft::L1Op(), + raft::Max(), + raft::Nop(), + eps); + break; + default: THROW("Unsupported norm type: %d", norm_type); + } +} + +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index 9b3f4ee347..5579acf355 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -117,17 +117,17 @@ void reduce(OutType* dots, template , + typename IdxType = std::uint32_t, + typename MainLambda = raft::Nop, typename ReduceLambda = raft::Sum, typename FinalLambda = raft::Nop> void reduce(const raft::handle_t& handle, - raft::device_matrix_view data, - raft::device_vector_view dots, + raft::device_matrix_view data, + raft::device_vector_view dots, OutElementType init, Apply apply, bool inplace = false, - MainLambda main_op = raft::Nop(), + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), FinalLambda final_op = raft::Nop()) { @@ -137,10 +137,10 @@ void reduce(const raft::handle_t& handle, bool 
along_rows = apply == Apply::ALONG_ROWS; if (along_rows) { - RAFT_EXPECTS(static_cast(dots.size()) == data.extent(1), + RAFT_EXPECTS(static_cast(dots.size()) == data.extent(1), "Output should be equal to number of columns in Input"); } else { - RAFT_EXPECTS(static_cast(dots.size()) == data.extent(0), + RAFT_EXPECTS(static_cast(dots.size()) == data.extent(0), "Output should be equal to number of rows in Input"); } diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh index a7917f21f8..436fce26fd 100644 --- a/cpp/include/raft/linalg/reduce_cols_by_key.cuh +++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh @@ -40,7 +40,7 @@ namespace linalg { * @param out the output reduced matrix along columns (dim = nrows x nkeys). * This will be assumed to be in row-major layout * @param nrows number of rows in the input data - * @param ncols number of colums in the input data + * @param ncols number of columns in the input data * @param nkeys number of unique keys in the keys array * @param stream cuda stream to launch the kernel onto */ diff --git a/cpp/include/raft/linalg/rsvd.cuh b/cpp/include/raft/linalg/rsvd.cuh index be2f5f0286..6f0315642b 100644 --- a/cpp/include/raft/linalg/rsvd.cuh +++ b/cpp/include/raft/linalg/rsvd.cuh @@ -148,13 +148,19 @@ void rsvdPerc(const raft::handle_t& handle, * @brief randomized singular value decomposition (RSVD) on a column major * rectangular matrix using QR decomposition, by specifying no. of PCs and * upsamples directly + * @tparam ValueType value type of parameters + * @tparam IndexType index type of parameters + * @tparam UType std::optional> @c + * U_in + * @tparam VType std::optional> @c + * V_in * @param[in] handle raft::handle_t * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) * @param[in] p no. 
of upsamples - * @param[out] U_in optional left singular values of raft::device_matrix_view with layout + * @param[out] U_in std::optional left singular values of raft::device_matrix_view with layout * raft::col_major - * @param[out] V_in optional right singular values of raft::device_matrix_view with layout + * @param[out] V_in std::optional right singular values of raft::device_matrix_view with layout * raft::col_major */ template @@ -216,13 +222,19 @@ void rsvd_fixed_rank(Args... args) * @brief randomized singular value decomposition (RSVD) on a column major * rectangular matrix using symmetric Eigen decomposition, by specifying no. of PCs and * upsamples directly. The rectangular input matrix is made square and symmetric using B @ B^T + * @tparam ValueType value type of parameters + * @tparam IndexType index type of parameters + * @tparam UType std::optional> @c + * U_in + * @tparam VType std::optional> @c + * V_in * @param[in] handle raft::handle_t * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) * @param[in] p no. of upsamples - * @param[out] U_in optional left singular values of raft::device_matrix_view with layout + * @param[out] U_in std::optional left singular values of raft::device_matrix_view with layout * raft::col_major - * @param[out] V_in optional right singular values of raft::device_matrix_view with layout + * @param[out] V_in std::optional right singular values of raft::device_matrix_view with layout * raft::col_major */ template @@ -285,15 +297,21 @@ void rsvd_fixed_rank_symmetric(Args... args) * @brief randomized singular value decomposition (RSVD) on a column major * rectangular matrix using Jacobi method, by specifying no. 
of PCs and * upsamples directly + * @tparam ValueType value type of parameters + * @tparam IndexType index type of parameters + * @tparam UType std::optional> @c + * U_in + * @tparam VType std::optional> @c + * V_in * @param[in] handle raft::handle_t * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) * @param[in] p no. of upsamples * @param[in] tol tolerance for Jacobi-based solvers * @param[in] max_sweeps maximum number of sweeps for Jacobi-based solvers - * @param[out] U_in optional left singular values of raft::device_matrix_view with layout + * @param[out] U_in std::optional left singular values of raft::device_matrix_view with layout * raft::col_major - * @param[out] V_in optional right singular values of raft::device_matrix_view with layout + * @param[out] V_in std::optional right singular values of raft::device_matrix_view with layout * raft::col_major */ template @@ -357,15 +375,21 @@ void rsvd_fixed_rank_jacobi(Args... args) * @brief randomized singular value decomposition (RSVD) on a column major * rectangular matrix using Jacobi method, by specifying no. of PCs and * upsamples directly. The rectangular input matrix is made square and symmetric using B @ B^T + * @tparam ValueType value type of parameters + * @tparam IndexType index type of parameters + * @tparam UType std::optional> @c + * U_in + * @tparam VType std::optional> @c + * V_in * @param[in] handle raft::handle_t * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) * @param[in] p no. 
of upsamples * @param[in] tol tolerance for Jacobi-based solvers * @param[in] max_sweeps maximum number of sweeps for Jacobi-based solvers - * @param[out] U_in optional left singular values of raft::device_matrix_view with layout + * @param[out] U_in std::optional left singular values of raft::device_matrix_view with layout * raft::col_major - * @param[out] V_in optional right singular values of raft::device_matrix_view with layout + * @param[out] V_in std::optional right singular values of raft::device_matrix_view with layout * raft::col_major */ template @@ -430,14 +454,20 @@ void rsvd_fixed_rank_symmetric_jacobi(Args... args) * @brief randomized singular value decomposition (RSVD) on a column major * rectangular matrix using QR decomposition, by specifying the PC and upsampling * ratio + * @tparam ValueType value type of parameters + * @tparam IndexType index type of parameters + * @tparam UType std::optional> @c + * U_in + * @tparam VType std::optional> @c + * V_in * @param[in] handle raft::handle_t * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) * @param[in] PC_perc percentage of singular values to be computed * @param[in] UpS_perc upsampling percentage - * @param[out] U_in optional left singular values of raft::device_matrix_view with layout + * @param[out] U_in std::optional left singular values of raft::device_matrix_view with layout * raft::col_major - * @param[out] V_in optional right singular values of raft::device_matrix_view with layout + * @param[out] V_in std::optional right singular values of raft::device_matrix_view with layout * raft::col_major */ template @@ -490,7 +520,7 @@ void rsvd_perc(const raft::handle_t& handle, * * Please see above for documentation of `rsvd_perc`. */ -template > +template > void rsvd_perc(Args... 
args) { rsvd_perc(std::forward(args)..., std::nullopt, std::nullopt); @@ -500,14 +530,20 @@ void rsvd_perc(Args... args) * @brief randomized singular value decomposition (RSVD) on a column major * rectangular matrix using symmetric Eigen decomposition, by specifying the PC and upsampling * ratio. The rectangular input matrix is made square and symmetric using B @ B^T + * @tparam ValueType value type of parameters + * @tparam IndexType index type of parameters + * @tparam UType std::optional> @c + * U_in + * @tparam VType std::optional> @c + * V_in * @param[in] handle raft::handle_t * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) * @param[in] PC_perc percentage of singular values to be computed * @param[in] UpS_perc upsampling percentage - * @param[out] U_in optional left singular values of raft::device_matrix_view with layout + * @param[out] U_in std::optional left singular values of raft::device_matrix_view with layout * raft::col_major - * @param[out] V_in optional right singular values of raft::device_matrix_view with layout + * @param[out] V_in std::optional right singular values of raft::device_matrix_view with layout * raft::col_major */ template @@ -560,7 +596,7 @@ void rsvd_perc_symmetric(const raft::handle_t& handle, * * Please see above for documentation of `rsvd_perc_symmetric`. */ -template > +template > void rsvd_perc_symmetric(Args... args) { rsvd_perc_symmetric(std::forward(args)..., std::nullopt, std::nullopt); @@ -570,6 +606,12 @@ void rsvd_perc_symmetric(Args... 
args) * @brief randomized singular value decomposition (RSVD) on a column major * rectangular matrix using Jacobi method, by specifying the PC and upsampling * ratio + * @tparam ValueType value type of parameters + * @tparam IndexType index type of parameters + * @tparam UType std::optional> @c + * U_in + * @tparam VType std::optional> @c + * V_in * @param[in] handle raft::handle_t * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) @@ -577,9 +619,9 @@ void rsvd_perc_symmetric(Args... args) * @param[in] UpS_perc upsampling percentage * @param[in] tol tolerance for Jacobi-based solvers * @param[in] max_sweeps maximum number of sweeps for Jacobi-based solvers - * @param[out] U_in optional left singular values of raft::device_matrix_view with layout + * @param[out] U_in std::optional left singular values of raft::device_matrix_view with layout * raft::col_major - * @param[out] V_in optional right singular values of raft::device_matrix_view with layout + * @param[out] V_in std::optional right singular values of raft::device_matrix_view with layout * raft::col_major */ template @@ -634,7 +676,7 @@ void rsvd_perc_jacobi(const raft::handle_t& handle, * * Please see above for documentation of `rsvd_perc_jacobi`. */ -template > +template > void rsvd_perc_jacobi(Args... args) { rsvd_perc_jacobi(std::forward(args)..., std::nullopt, std::nullopt); @@ -644,6 +686,12 @@ void rsvd_perc_jacobi(Args... args) * @brief randomized singular value decomposition (RSVD) on a column major * rectangular matrix using Jacobi method, by specifying the PC and upsampling * ratio. 
The rectangular input matrix is made square and symmetric using B @ B^T + * @tparam ValueType value type of parameters + * @tparam IndexType index type of parameters + * @tparam UType std::optional> @c + * U_in + * @tparam VType std::optional> @c + * V_in * @param[in] handle raft::handle_t * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) @@ -651,9 +699,9 @@ void rsvd_perc_jacobi(Args... args) * @param[in] UpS_perc upsampling percentage * @param[in] tol tolerance for Jacobi-based solvers * @param[in] max_sweeps maximum number of sweeps for Jacobi-based solvers - * @param[out] U_in optional left singular values of raft::device_matrix_view with layout + * @param[out] U_in std::optional left singular values of raft::device_matrix_view with layout * raft::col_major - * @param[out] V_in optional right singular values of raft::device_matrix_view with layout + * @param[out] V_in std::optional right singular values of raft::device_matrix_view with layout * raft::col_major */ template @@ -709,7 +757,7 @@ void rsvd_perc_symmetric_jacobi( * * Please see above for documentation of `rsvd_perc_symmetric_jacobi`. */ -template > +template > void rsvd_perc_symmetric_jacobi(Args... 
args) { rsvd_perc_symmetric_jacobi(std::forward(args)..., std::nullopt, std::nullopt); diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh index 9147692c03..0aa4aecef5 100644 --- a/cpp/include/raft/linalg/strided_reduction.cuh +++ b/cpp/include/raft/linalg/strided_reduction.cuh @@ -24,6 +24,8 @@ #include #include +#include + namespace raft { namespace linalg { @@ -71,8 +73,16 @@ void stridedReduction(OutType* dots, ReduceLambda reduce_op = raft::Sum(), FinalLambda final_op = raft::Nop()) { - detail::stridedReduction( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + // Only compile for types supported by myAtomicReduce, but don't make the compilation fail in + // other cases, because coalescedReduction supports arbitrary types. + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v) { + detail::stridedReduction( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else { + THROW("Unsupported type for stridedReduction: %s", typeid(OutType).name()); + } } /** diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh index 4f81822a13..e6f2fa8724 100644 --- a/cpp/include/raft/linalg/subtract.cuh +++ b/cpp/include/raft/linalg/subtract.cuh @@ -67,7 +67,7 @@ void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream detail::subtract(out, in1, in2, len, stream); } -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and +/** Subtract single value pointed by singleScalarDev parameter in device memory from inDev[i] and * write result to outDev[i] * @tparam math_t data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh index fb30f17477..7be1b9d63c 100644 --- a/cpp/include/raft/linalg/svd.cuh 
+++ b/cpp/include/raft/linalg/svd.cuh @@ -189,12 +189,18 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle, /** * @brief singular value decomposition (SVD) on a column major * matrix using QR decomposition + * @tparam ValueType value type of parameters + * @tparam IndexType index type of parameters + * @tparam UType std::optional> @c + * U_in + * @tparam VType std::optional> @c + * V_in * @param[in] handle raft::handle_t * @param[in] in input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] sing_vals singular values raft::device_vector_view of shape (K) - * @param[out] U_in optional left singular values of raft::device_matrix_view with layout + * @param[out] U_in std::optional left singular values of raft::device_matrix_view with layout * raft::col_major and dimensions (m, n) - * @param[out] V_in optional right singular values of raft::device_matrix_view with + * @param[out] V_in std::optional right singular values of raft::device_matrix_view with * layout raft::col_major and dimensions (n, n) */ template @@ -237,7 +243,7 @@ void svd_qr(const raft::handle_t& handle, * * Please see above for documentation of `svd_qr`. */ -template > +template > void svd_qr(Args... args) { svd_qr(std::forward(args)..., std::nullopt, std::nullopt); @@ -246,12 +252,18 @@ void svd_qr(Args... args) /** * @brief singular value decomposition (SVD) on a column major * matrix using QR decomposition. 
Right singular vector matrix is transposed before returning + * @tparam ValueType value type of parameters + * @tparam IndexType index type of parameters + * @tparam UType std::optional> @c + * U_in + * @tparam VType std::optional> @c + * V_in * @param[in] handle raft::handle_t * @param[in] in input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] sing_vals singular values raft::device_vector_view of shape (K) - * @param[out] U_in optional left singular values of raft::device_matrix_view with layout + * @param[out] U_in std::optional left singular values of raft::device_matrix_view with layout * raft::col_major and dimensions (m, n) - * @param[out] V_in optional right singular values of raft::device_matrix_view with + * @param[out] V_in std::optional right singular values of raft::device_matrix_view with * layout raft::col_major and dimensions (n, n) */ template @@ -295,7 +307,7 @@ void svd_qr_transpose_right_vec( * * Please see above for documentation of `svd_qr_transpose_right_vec`. */ -template > +template > void svd_qr_transpose_right_vec(Args... 
args) { svd_qr_transpose_right_vec(std::forward(args)..., std::nullopt, std::nullopt); diff --git a/cpp/include/raft/matrix/col_wise_sort.cuh b/cpp/include/raft/matrix/col_wise_sort.cuh index d26f5f73cf..5f9b3ab848 100644 --- a/cpp/include/raft/matrix/col_wise_sort.cuh +++ b/cpp/include/raft/matrix/col_wise_sort.cuh @@ -58,18 +58,22 @@ void sort_cols_per_row(const InType* in, * @tparam in_t: element type of input matrix * @tparam out_t: element type of output matrix * @tparam matrix_idx_t: integer type for matrix indexing + * @tparam sorted_keys_t: std::optional> @c sorted_keys_opt * @param[in] handle: raft handle * @param[in] in: input matrix * @param[out] out: output value(index) matrix - * @param[out] sorted_keys: Optional, output matrix for sorted keys (input) + * @param[out] sorted_keys_opt: std::optional, output matrix for sorted keys (input) */ -template +template void sort_cols_per_row(const raft::handle_t& handle, raft::device_matrix_view in, raft::device_matrix_view out, - std::optional> - sorted_keys = std::nullopt) + sorted_keys_t&& sorted_keys_opt) { + std::optional> sorted_keys = + std::forward(sorted_keys_opt); + RAFT_EXPECTS(in.extent(1) == out.extent(1) && in.extent(0) == out.extent(0), "Input and output matrices must have the same shape."); @@ -109,26 +113,6 @@ void sort_cols_per_row(const raft::handle_t& handle, } } -namespace sort_cols_per_row_impl { -template -struct sorted_keys_alias { -}; - -template <> -struct sorted_keys_alias { - using type = double; -}; - -template -struct sorted_keys_alias< - std::optional>> { - using type = typename raft::device_matrix_view::value_type; -}; - -template -using sorted_keys_t = typename sorted_keys_alias::type; -} // namespace sort_cols_per_row_impl - /** * @brief Overload of `sort_keys_per_row` to help the * compiler find the above overload, in case users pass in @@ -136,18 +120,10 @@ using sorted_keys_t = typename sorted_keys_alias::type; * * Please see above for documentation of `sort_keys_per_row`. 
*/ -template -void sort_cols_per_row(const raft::handle_t& handle, - raft::device_matrix_view in, - raft::device_matrix_view out, - sorted_keys_vector_type sorted_keys) +template > +void sort_cols_per_row(Args... args) { - using sorted_keys_type = sort_cols_per_row_impl::sorted_keys_t< - std::remove_const_t>>; - std::optional> sorted_keys_opt = - std::forward(sorted_keys); - - sort_cols_per_row(handle, in, out, sorted_keys_opt); + sort_cols_per_row(std::forward(args)..., std::nullopt); } }; // end namespace raft::matrix diff --git a/cpp/include/raft/matrix/detail/columnWiseSort.cuh b/cpp/include/raft/matrix/detail/columnWiseSort.cuh index 97345aecb6..5df7ba3cdc 100644 --- a/cpp/include/raft/matrix/detail/columnWiseSort.cuh +++ b/cpp/include/raft/matrix/detail/columnWiseSort.cuh @@ -229,7 +229,7 @@ void sortColumnsPerRow(const InType* in, // will give better perf than below deviceWide Sort for even larger dims int numSegments = n_rows + 1; - // need auxillary storage: cub sorting + keys (if user not passing) + + // need auxiliary storage: cub sorting + keys (if user not passing) + // staging for values out + segment partition if (workspaceSize == 0 || !workspacePtr) { OutType* tmpValIn = nullptr; diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh index dd1da1e498..3738afba5d 100644 --- a/cpp/include/raft/matrix/detail/gather.cuh +++ b/cpp/include/raft/matrix/detail/gather.cuh @@ -109,7 +109,7 @@ void gatherImpl(const MatrixIteratorT in, // stencil value type typedef typename std::iterator_traits::value_type StencilValueT; - // return type of MapTransformOp, must be convertable to IndexT + // return type of MapTransformOp, must be convertible to IndexT typedef typename std::result_of::type MapTransformOpReturnT; static_assert((std::is_convertible::value), "MapTransformOp's result type must be convertible to signed integer"); diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh 
b/cpp/include/raft/matrix/detail/linewise_op.cuh index 37198684ee..605726bea6 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -63,7 +63,7 @@ struct Linewise { * * Block work arrangement: blocked; * one warp works on a contiguous chunk of a matrix. Since the matrix is represented - * as a flat array, such an arangement minimizes the number of times when a single + * as a flat array, such an arrangement minimizes the number of times when a single * thread needs to reload the vector value at an index corresponding to the current * matrix row. Ideally, a thread would load a value from a vector only once, but that * is not possible if the vector size (= number of matrix rows) is too small or not @@ -483,7 +483,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) L::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); } else { // second block: offset = arrTail, length = len - arrTail - // NB: I substract MaxOffset (= blockDim.x) to get the correct indexing for block 1 + // NB: I subtract MaxOffset (= blockDim.x) to get the correct indexing for block 1 L::vectorRows(reinterpret_cast(out + arrTail - MaxOffset), reinterpret_cast(in + arrTail - MaxOffset), len - arrTail + MaxOffset, diff --git a/cpp/include/raft/neighbors/detail/refine.cuh b/cpp/include/raft/neighbors/detail/refine.cuh new file mode 100644 index 0000000000..c838af85d6 --- /dev/null +++ b/cpp/include/raft/neighbors/detail/refine.cuh @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace raft::neighbors::detail { + +/** Checks whether the input data extents are compatible. */ +template +void check_input(extents_t dataset, + extents_t queries, + extents_t candidates, + extents_t indices, + extents_t distances, + distance::DistanceType metric) +{ + auto n_queries = queries.extent(0); + auto k = distances.extent(1); + + RAFT_EXPECTS(k <= raft::spatial::knn::detail::topk::kMaxCapacity, + "k must be less than topk::kMaxCapacity (%d).", + raft::spatial::knn::detail::topk::kMaxCapacity); + + RAFT_EXPECTS(indices.extent(0) == n_queries && distances.extent(0) == n_queries && + candidates.extent(0) == n_queries, + "Number of rows in output indices and distances matrices must equal number of rows " + "in search matrix."); + + RAFT_EXPECTS(indices.extent(1) == k, + "Number of columns in output indices and distances matrices must be equal to k"); + + RAFT_EXPECTS(queries.extent(1) == dataset.extent(1), + "Number of columns must be equal for dataset and queries"); + + RAFT_EXPECTS(candidates.extent(1) >= k, + "Number of neighbor candidates must not be smaller than k (%d vs %d)", + static_cast(candidates.extent(1)), + static_cast(k)); +} + +/** + * See raft::neighbors::refine for docs. 
+ */ +template +void refine_device(raft::handle_t const& handle, + raft::device_matrix_view dataset, + raft::device_matrix_view queries, + raft::device_matrix_view neighbor_candidates, + raft::device_matrix_view indices, + raft::device_matrix_view distances, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded) +{ + matrix_idx n_candidates = neighbor_candidates.extent(1); + matrix_idx n_queries = queries.extent(0); + matrix_idx dim = dataset.extent(1); + uint32_t k = static_cast(indices.extent(1)); + + common::nvtx::range fun_scope( + "neighbors::refine(%zu, %u)", size_t(n_queries), uint32_t(n_candidates)); + + check_input(dataset.extents(), + queries.extents(), + neighbor_candidates.extents(), + indices.extents(), + distances.extents(), + metric); + + // The refinement search can be mapped to an IVF flat search: + // - We consider that the candidate vectors form a cluster, separately for each query. + // - In other words, the n_queries * n_candidates vectors form n_queries clusters, each with + // n_candidates elements. + // - We consider that the coarse level search is already performed and assigned a single cluster + // to search for each query (the cluster formed from the corresponding candidates). + // - We run IVF flat search with n_probes=1 to select the best k elements of the candidates. 
+ rmm::device_uvector fake_coarse_idx(n_queries, handle.get_stream()); + + thrust::sequence( + handle.get_thrust_policy(), fake_coarse_idx.data(), fake_coarse_idx.data() + n_queries); + + raft::neighbors::ivf_flat::index refinement_index( + handle, metric, n_queries, false, dim); + + raft::spatial::knn::ivf_flat::detail::fill_refinement_index(handle, + &refinement_index, + dataset.data_handle(), + neighbor_candidates.data_handle(), + n_queries, + n_candidates); + + uint32_t grid_dim_x = 1; + raft::spatial::knn::ivf_flat::detail::ivfflat_interleaved_scan< + data_t, + typename raft::spatial::knn::detail::utils::config::value_t, + idx_t>(refinement_index, + queries.data_handle(), + fake_coarse_idx.data(), + static_cast(n_queries), + refinement_index.metric(), + 1, + k, + raft::spatial::knn::ivf_flat::detail::is_min_close(metric), + indices.data_handle(), + distances.data_handle(), + grid_dim_x, + handle.get_stream()); +} + +/** Helper structure for naive CPU implementation of refine. */ +typedef struct { + uint64_t id; + float distance; +} struct_for_refinement; + +int _postprocessing_qsort_compare(const void* v1, const void* v2) +{ + // sort in ascending order + if (((struct_for_refinement*)v1)->distance > ((struct_for_refinement*)v2)->distance) { + return 1; + } else if (((struct_for_refinement*)v1)->distance < ((struct_for_refinement*)v2)->distance) { + return -1; + } else { + return 0; + } +} + +/** + * Naive CPU implementation of refine operation + * + * All pointers are expected to be accessible on the host. 
+ */ +template +void refine_host(raft::host_matrix_view dataset, + raft::host_matrix_view queries, + raft::host_matrix_view neighbor_candidates, + raft::host_matrix_view indices, + raft::host_matrix_view distances, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded) +{ + check_input(dataset.extents(), + queries.extents(), + neighbor_candidates.extents(), + indices.extents(), + distances.extents(), + metric); + + switch (metric) { + case raft::distance::DistanceType::L2Expanded: break; + case raft::distance::DistanceType::InnerProduct: break; + default: throw raft::logic_error("Unsupported metric"); + } + + size_t numDataset = dataset.extent(0); + size_t numQueries = queries.extent(0); + size_t dimDataset = dataset.extent(1); + const data_t* dataset_ptr = dataset.data_handle(); + const data_t* queries_ptr = queries.data_handle(); + const idx_t* neighbors = neighbor_candidates.data_handle(); + idx_t topK = neighbor_candidates.extent(1); + idx_t refinedTopK = indices.extent(1); + idx_t* refinedNeighbors = indices.data_handle(); + distance_t* refinedDistances = distances.data_handle(); + + common::nvtx::range fun_scope( + "neighbors::refine_host(%zu, %u)", size_t(numQueries), uint32_t(topK)); + +#pragma omp parallel + { + struct_for_refinement* sfr = + (struct_for_refinement*)malloc(sizeof(struct_for_refinement) * topK); + for (size_t i = omp_get_thread_num(); i < numQueries; i += omp_get_num_threads()) { + // compute distance with original dataset vectors + const data_t* cur_query = queries_ptr + ((uint64_t)dimDataset * i); + for (size_t j = 0; j < (size_t)topK; j++) { + idx_t id = neighbors[j + (topK * i)]; + const data_t* cur_dataset = dataset_ptr + ((uint64_t)dimDataset * id); + float distance = 0.0; + for (size_t k = 0; k < (size_t)dimDataset; k++) { + float val_q = (float)(cur_query[k]); + float val_d = (float)(cur_dataset[k]); + if (metric == raft::distance::DistanceType::InnerProduct) { + distance += -val_q * val_d; // Negate because we sort 
in ascending order. + } else { + distance += (val_q - val_d) * (val_q - val_d); + } + } + sfr[j].id = id; + sfr[j].distance = distance; + } + + qsort(sfr, topK, sizeof(struct_for_refinement), _postprocessing_qsort_compare); + + for (size_t j = 0; j < (size_t)refinedTopK; j++) { + refinedNeighbors[j + (refinedTopK * i)] = sfr[j].id; + if (refinedDistances == NULL) continue; + if (metric == raft::distance::DistanceType::InnerProduct) { + refinedDistances[j + (refinedTopK * i)] = -sfr[j].distance; + } else { + refinedDistances[j + (refinedTopK * i)] = sfr[j].distance; + } + } + } + free(sfr); + } +} + +} // namespace raft::neighbors::detail \ No newline at end of file diff --git a/cpp/include/raft/neighbors/ivf_flat_types.hpp b/cpp/include/raft/neighbors/ivf_flat_types.hpp index c7e3798f5d..44b88a0b23 100644 --- a/cpp/include/raft/neighbors/ivf_flat_types.hpp +++ b/cpp/include/raft/neighbors/ivf_flat_types.hpp @@ -24,6 +24,7 @@ #include #include +#include namespace raft::neighbors::ivf_flat { @@ -37,6 +38,19 @@ struct index_params : ann::index_params { uint32_t kmeans_n_iters = 20; /** The fraction of data to use during iterative kmeans building. */ double kmeans_trainset_fraction = 0.5; + /** + * By default (adaptive_centers = false), the cluster centers are trained in `ivf_flat::build`, + * and never modified in `ivf_flat::extend`. As a result, you may need to retrain the index + * from scratch after invoking (`ivf_flat::extend`) a few times with new data, the distribution of + * which is no longer representative of the original training set. + * + * The alternative behavior (adaptive_centers = true) is to update the cluster centers for new + * data when it is added. In this case, `index.centers()` are always exactly the centroids of the + * data in the corresponding clusters. 
The drawback of this behavior is that the centroids depend + * on the order of adding new data (through the classification of the added data); that is, + * `index.centers()` "drift" together with the changing distribution of the newly added data. + */ + bool adaptive_centers = false; }; struct search_params : ann::search_params { @@ -72,6 +86,11 @@ struct index : ann::index { { return metric_; } + /** Whether `centers()` change upon extending the index (ivf_pq::extend). */ + [[nodiscard]] constexpr inline auto adaptive_centers() const noexcept -> bool + { + return adaptive_centers_; + } /** * Inverted list data [size, dim]. * @@ -200,10 +219,15 @@ struct index : ann::index { ~index() = default; /** Construct an empty index. It needs to be trained and then populated. */ - index(const handle_t& handle, raft::distance::DistanceType metric, uint32_t n_lists, uint32_t dim) + index(const handle_t& handle, + raft::distance::DistanceType metric, + uint32_t n_lists, + bool adaptive_centers, + uint32_t dim) : ann::index(), veclen_(calculate_veclen(dim)), metric_(metric), + adaptive_centers_(adaptive_centers), data_(make_device_mdarray(handle, make_extents(0, dim))), indices_(make_device_mdarray(handle, make_extents(0))), list_sizes_(make_device_mdarray(handle, make_extents(n_lists))), @@ -216,7 +240,7 @@ struct index : ann::index { /** Construct an empty index. It needs to be trained and then populated. 
*/ index(const handle_t& handle, const index_params& params, uint32_t dim) - : index(handle, params.metric, params.n_lists, dim) + : index(handle, params.metric, params.n_lists, params.adaptive_centers, dim) { } @@ -242,6 +266,7 @@ struct index : ann::index { */ uint32_t veclen_; raft::distance::DistanceType metric_; + bool adaptive_centers_; device_mdarray, row_major> data_; device_mdarray, row_major> indices_; device_mdarray, row_major> list_sizes_; diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh index 207e298947..5b2035fadf 100644 --- a/cpp/include/raft/neighbors/ivf_pq.cuh +++ b/cpp/include/raft/neighbors/ivf_pq.cuh @@ -53,7 +53,7 @@ namespace raft::neighbors::ivf_pq { * * @param handle * @param params configure the index building - * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim] + * @param[in] dataset a device/host pointer to a row-major matrix [n_rows, dim] * @param n_rows the number of samples * @param dim the dimensionality of the data * @@ -91,8 +91,8 @@ inline auto build( * * @param handle * @param orig_index original index - * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()] - * @param[in] new_indices a device pointer to a vector of indices [n_rows]. + * @param[in] new_vectors a device/host pointer to a row-major matrix [n_rows, index.dim()] + * @param[in] new_indices a device/host pointer to a vector of indices [n_rows]. * If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr` * here to imply a continuous range `[0...n_rows)`. * @param n_rows the number of samples @@ -118,8 +118,8 @@ inline auto extend(const handle_t& handle, * * @param handle * @param[inout] index - * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()] - * @param[in] new_indices a device pointer to a vector of indices [n_rows]. 
+ * @param[in] new_vectors a device/host pointer to a row-major matrix [n_rows, index.dim()] + * @param[in] new_indices a device/host pointer to a vector of indices [n_rows]. * If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr` * here to imply a continuous range `[0...n_rows)`. * @param n_rows the number of samples diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index 3dbf004e95..afb3eb6cd6 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -23,6 +23,8 @@ #include #include +#include + #include namespace raft::neighbors::ivf_pq { @@ -108,17 +110,30 @@ struct search_params : ann::search_params { */ cudaDataType_t internal_distance_dtype = CUDA_R_32F; /** - * Thread block size of the distance calculation kernel at search time. - * When zero, an optimal block size is selected using a heuristic. + * Preferred fraction of SM's unified memory / L1 cache to be used as shared memory. + * + * Possible values: [0.0 - 1.0] as a fraction of the `sharedMemPerMultiprocessor`. + * + * One wants to increase the carveout to make sure a good GPU occupancy for the main search + * kernel, but not to keep it too high to leave some memory to be used as L1 cache. Note, this + * value is interpreted only as a hint. Moreover, a GPU usually allows only a fixed set of cache + * configurations, so the provided value is rounded up to the nearest configuration. Refer to the + * NVIDIA tuning guide for the target GPU architecture. * - * Possible values: [0, 256, 512, 1024] + * Note, this is a low-level tuning parameter that can have drastic negative effects on the search + * performance if tweaked incorrectly. */ - uint32_t preferred_thread_block_size = 0; + double preferred_shmem_carveout = 1.0; }; static_assert(std::is_aggregate_v); static_assert(std::is_aggregate_v); +/** Size of the interleaved group. 
*/ +constexpr static uint32_t kIndexGroupSize = 32; +/** Stride of the interleaved group for vectorized loads. */ +constexpr static uint32_t kIndexGroupVecLen = 16; + /** * @brief IVF-PQ index. * @@ -170,6 +185,19 @@ struct index : ann::index { "IdxT must be able to represent all values of uint32_t"); public: + /** + * Default value filled in the `indices()` array. + * One may encounter it trying to access a record within a cluster that is outside of the + * `list_sizes()` bound (due to the record alignment `kIndexGroupSize`). + */ + constexpr static IdxT kInvalidRecord = std::numeric_limits::max() - 1; + /** + * Default value returned by `search` when the `n_probes` is too small and top-k is too large. + * One may encounter it if the combined size of probed clusters is smaller than the requested + * number of results per query. + */ + constexpr static IdxT kOutOfBoundsRecord = std::numeric_limits::max(); + /** Total length of the index. */ [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT { return indices_.extent(0); } /** Dimensionality of the input data. */ @@ -247,12 +275,12 @@ struct index : ann::index { pq_dim_(pq_dim == 0 ? 
calculate_pq_dim(dim) : pq_dim), n_nonempty_lists_(n_nonempty_lists), pq_centers_{make_device_mdarray(handle, make_pq_centers_extents())}, - pq_dataset_{make_device_mdarray( - handle, make_extents(0, this->pq_dim() * this->pq_bits() / 8))}, + pq_dataset_{make_device_mdarray(handle, make_pq_dataset_extents(0))}, indices_{make_device_mdarray(handle, make_extents(0))}, rotation_matrix_{ make_device_mdarray(handle, make_extents(this->rot_dim(), this->dim()))}, list_offsets_{make_device_mdarray(handle, make_extents(this->n_lists() + 1))}, + list_sizes_{make_device_mdarray(handle, make_extents(this->n_lists()))}, centers_{make_device_mdarray( handle, make_extents(this->n_lists(), this->dim_ext()))}, centers_rot_{make_device_mdarray( @@ -283,35 +311,49 @@ struct index : ann::index { */ void allocate(const handle_t& handle, IdxT index_size) { - pq_dataset_ = - make_device_mdarray(handle, make_extents(index_size, pq_dataset_.extent(1))); - indices_ = make_device_mdarray(handle, make_extents(index_size)); + pq_dataset_ = make_device_mdarray(handle, make_pq_dataset_extents(index_size)); + indices_ = make_device_mdarray(handle, make_extents(index_size)); + if (index_size > 0) { + thrust::fill_n( + handle.get_thrust_policy(), indices_.data_handle(), index_size, kInvalidRecord); + } check_consistency(); } + using pq_centers_extents = + std::experimental::extents; /** * PQ cluster centers * - * - codebook_gen::PER_SUBSPACE: [pq_dim , pq_book_size, pq_len] - * - codebook_gen::PER_CLUSTER: [n_lists, pq_book_size, pq_len] + * - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size] + * - codebook_gen::PER_CLUSTER: [n_lists, pq_len, pq_book_size] */ - inline auto pq_centers() noexcept -> device_mdspan, row_major> + inline auto pq_centers() noexcept -> device_mdspan { return pq_centers_.view(); } [[nodiscard]] inline auto pq_centers() const noexcept - -> device_mdspan, row_major> + -> device_mdspan { return pq_centers_.view(); } - /** PQ-encoded data [size, pq_dim * pq_bits / 8]. 
*/ - inline auto pq_dataset() noexcept -> device_mdspan, row_major> + using pq_dataset_extents = std::experimental:: + extents; + /** PQ-encoded data stored in the interleaved format: + * + * [ ceildiv(size, kIndexGroupSize) + * , ceildiv(pq_dim, (kIndexGroupVecLen * 8u) / pq_bits) + * , kIndexGroupSize + * , kIndexGroupVecLen + * ]. + */ + inline auto pq_dataset() noexcept -> device_mdspan { return pq_dataset_.view(); } [[nodiscard]] inline auto pq_dataset() const noexcept - -> device_mdspan, row_major> + -> device_mdspan { return pq_dataset_.view(); } @@ -352,6 +394,17 @@ struct index : ann::index { return list_offsets_.view(); } + /** Sizes of the lists [n_lists]. */ + inline auto list_sizes() noexcept -> device_mdspan, row_major> + { + return list_sizes_.view(); + } + [[nodiscard]] inline auto list_sizes() const noexcept + -> device_mdspan, row_major> + { + return list_sizes_.view(); + } + /** Cluster centers corresponding to the lists in the original space [n_lists, dim_ext] */ inline auto centers() noexcept -> device_mdspan, row_major> { @@ -374,6 +427,18 @@ struct index : ann::index { return centers_rot_.view(); } + /** A helper function to determine the extents of an array enough to hold a given amount of data. 
+ */ + auto make_pq_dataset_extents(IdxT n_rows) -> pq_dataset_extents + { + // how many elems of pq_dim fit into one kIndexGroupVecLen-byte chunk + auto pq_chunk = (kIndexGroupVecLen * 8u) / pq_bits(); + return make_extents(raft::div_rounding_up_safe(n_rows, kIndexGroupSize), + raft::div_rounding_up_safe(pq_dim(), pq_chunk), + kIndexGroupSize, + kIndexGroupVecLen); + } + private: raft::distance::DistanceType metric_; codebook_gen codebook_kind_; @@ -383,11 +448,12 @@ struct index : ann::index { uint32_t pq_dim_; uint32_t n_nonempty_lists_; - device_mdarray, row_major> pq_centers_; - device_mdarray, row_major> pq_dataset_; + device_mdarray pq_centers_; + device_mdarray pq_dataset_; device_mdarray, row_major> indices_; device_mdarray, row_major> rotation_matrix_; device_mdarray, row_major> list_offsets_; + device_mdarray, row_major> list_sizes_; device_mdarray, row_major> centers_; device_mdarray, row_major> centers_rot_; @@ -404,13 +470,13 @@ struct index : ann::index { pq_bits() * pq_dim()); } - auto make_pq_centers_extents() -> extent_3d + auto make_pq_centers_extents() -> pq_centers_extents { switch (codebook_kind()) { case codebook_gen::PER_SUBSPACE: - return make_extents(pq_dim(), pq_book_size(), pq_len()); + return make_extents(pq_dim(), pq_len(), pq_book_size()); case codebook_gen::PER_CLUSTER: - return make_extents(n_lists(), pq_book_size(), pq_len()); + return make_extents(n_lists(), pq_len(), pq_book_size()); default: RAFT_FAIL("Unreachable code"); } } @@ -420,7 +486,7 @@ struct index : ann::index { // If the dimensionality is large enough, we can reduce it to improve performance if (dim >= 128) { dim /= 2; } // Round it down to 32 to improve performance. 
- uint32_t r = raft::round_down_safe(dim, 32); + auto r = raft::round_down_safe(dim, 32); if (r > 0) return r; // If the dimensionality is really low, round it to the closest power-of-two r = 1; diff --git a/cpp/include/raft/neighbors/refine.cuh b/cpp/include/raft/neighbors/refine.cuh new file mode 100644 index 0000000000..7b6708f18c --- /dev/null +++ b/cpp/include/raft/neighbors/refine.cuh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace raft::neighbors { + +/** + * @brief Refine nearest neighbor search. + * + * Refinement is an operation that follows an approximate NN search. The approximate search has + * already selected n_candidates neighbor candidates for each query. We narrow it down to k + * neighbors. For each query, we calculate the exact distance between the query and its + * n_candidates neighbor candidate, and select the k nearest ones. + * + * The k nearest neighbors and distances are returned. 
+ * + * Example usage + * @code{.cpp} + * using namespace raft::neighbors; + * // use default index parameters + * ivf_pq::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = ivf_pq::build(handle, index_params, dataset, N, D); + * // use default search parameters + * ivf_pq::search_params search_params; + * // search m = 4 * k nearest neighbours for each of the N queries + * ivf_pq::search(handle, search_params, index, queries, N, 4 * k, neighbor_candidates, + * out_dists_tmp); + * // refine it to the k nearest one + * refine(handle, dataset, queries, neighbor_candidates, out_indices, out_dists, + * index.metric()); + * @endcode + * + * + * @param[in] handle the raft handle + * @param[in] dataset device matrix that stores the dataset [n_rows, dims] + * @param[in] queries device matrix of the queries [n_queris, dims] + * @param[in] neighbor_candidates indices of candidate vectors [n_queries, n_candidates], where + * n_candidates >= k + * @param[out] indices device matrix that stores the refined indices [n_queries, k] + * @param[out] distances device matrix that stores the refined distances [n_queries, k] + * @param[in] metric distance metric to use. Euclidean (L2) is used by default + */ +template +void refine(raft::handle_t const& handle, + raft::device_matrix_view dataset, + raft::device_matrix_view queries, + raft::device_matrix_view neighbor_candidates, + raft::device_matrix_view indices, + raft::device_matrix_view distances, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded) +{ + detail::refine_device(handle, dataset, queries, neighbor_candidates, indices, distances, metric); +} + +/** Same as above, but all input and out data is in host memory. 
+ * @param[in] handle the raft handle + * @param[in] dataset host matrix that stores the dataset [n_rows, dims] + * @param[in] queries host matrix of the queries [n_queris, dims] + * @param[in] neighbor_candidates host matrix with indices of candidate vectors [n_queries, + * n_candidates], where n_candidates >= k + * @param[out] indices host matrix that stores the refined indices [n_queries, k] + * @param[out] distances host matrix that stores the refined distances [n_queries, k] + * @param[in] metric distance metric to use. Euclidean (L2) is used by default + */ +template +void refine(raft::handle_t const& handle, + raft::host_matrix_view dataset, + raft::host_matrix_view queries, + raft::host_matrix_view neighbor_candidates, + raft::host_matrix_view indices, + raft::host_matrix_view distances, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded) +{ + detail::refine_host(dataset, queries, neighbor_candidates, indices, distances, metric); +} +} // namespace raft::neighbors diff --git a/cpp/include/raft/neighbors/specializations/ivf_pq_specialization.hpp b/cpp/include/raft/neighbors/specializations/ivf_pq_specialization.hpp new file mode 100644 index 0000000000..2bce997e18 --- /dev/null +++ b/cpp/include/raft/neighbors/specializations/ivf_pq_specialization.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace raft::neighbors ::ivf_pq { + +#define RAFT_INST_SEARCH(T, IdxT) \ + void search(const handle_t&, \ + const search_params&, \ + const index&, \ + const T*, \ + uint32_t, \ + uint32_t, \ + IdxT*, \ + float*, \ + rmm::mr::device_memory_resource*); + +RAFT_INST_SEARCH(float, uint64_t); +RAFT_INST_SEARCH(int8_t, uint64_t); +RAFT_INST_SEARCH(uint8_t, uint64_t); + +#undef RAFT_INST_SEARCH + +// We define overloads for build and extend with void return type. This is used in the Cython +// wrappers, where exception handling is not compatible with return type that has nontrivial +// constructor. +#define RAFT_INST_BUILD_EXTEND(T, IdxT) \ + auto build(const handle_t& handle, \ + const index_params& params, \ + const T* dataset, \ + IdxT n_rows, \ + uint32_t dim) \ + ->index; \ + \ + auto extend(const handle_t& handle, \ + const index& orig_index, \ + const T* new_vectors, \ + const IdxT* new_indices, \ + IdxT n_rows) \ + ->index; \ + \ + void build(const handle_t& handle, \ + const index_params& params, \ + const T* dataset, \ + IdxT n_rows, \ + uint32_t dim, \ + index* idx); \ + \ + void extend(const handle_t& handle, \ + index* idx, \ + const T* new_vectors, \ + const IdxT* new_indices, \ + IdxT n_rows); + +RAFT_INST_BUILD_EXTEND(float, uint64_t) +RAFT_INST_BUILD_EXTEND(int8_t, uint64_t) +RAFT_INST_BUILD_EXTEND(uint8_t, uint64_t) + +#undef RAFT_INST_BUILD_EXTEND + +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh index 2d19773c3b..5bed71f2f4 100644 --- a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh +++ b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh @@ -16,7 +16,6 @@ #pragma once #include "curand_wrappers.hpp" -#include "random_types.hpp" #include #include #include @@ -26,13 +25,14 @@ #include #include #include +#include #include #include #include #include #include -// mvg.cuh 
takes in matrices that are colomn major (as in fortan) +// mvg.cuh takes in matrices that are colomn major (as in fortran) #define IDX2C(i, j, ld) (j * ld + i) namespace raft::random { @@ -170,13 +170,13 @@ class multi_variable_gaussian_impl { std::size_t get_workspace_size() { // malloc workspace_decomp - std::size_t granuality = 256, offset = 0; + std::size_t granularity = 256, offset = 0; workspace_decomp = (T*)offset; - offset += raft::alignTo(sizeof(T) * Lwork, granuality); + offset += raft::alignTo(sizeof(T) * Lwork, granularity); eig = (T*)offset; - offset += raft::alignTo(sizeof(T) * dim, granuality); + offset += raft::alignTo(sizeof(T) * dim, granularity); info = (int*)offset; - offset += raft::alignTo(sizeof(int), granuality); + offset += raft::alignTo(sizeof(int), granularity); return offset; } diff --git a/cpp/include/raft/random/detail/permute.cuh b/cpp/include/raft/random/detail/permute.cuh index 9582f69e34..9dd3912fc4 100644 --- a/cpp/include/raft/random/detail/permute.cuh +++ b/cpp/include/raft/random/detail/permute.cuh @@ -35,7 +35,7 @@ __global__ void permuteKernel( int tid = threadIdx.x + blockIdx.x * blockDim.x; // having shuffled input indices and coalesced output indices appears - // to be preferrable to the reverse, especially for column major + // to be preferable to the reverse, especially for column major IntType inIdx = ((a * int64_t(tid)) + b) % N; IntType outIdx = tid; diff --git a/cpp/include/raft/random/detail/random_types.hpp b/cpp/include/raft/random/detail/random_types.hpp deleted file mode 100644 index 28108f9513..0000000000 --- a/cpp/include/raft/random/detail/random_types.hpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -namespace raft::random::detail { - -enum class multi_variable_gaussian_decomposition_method { CHOLESKY, JACOBI, QR }; - -}; // end of namespace raft::random::detail diff --git a/cpp/include/raft/random/multi_variable_gaussian.cuh b/cpp/include/raft/random/multi_variable_gaussian.cuh index 796a10fb65..6bee323007 100644 --- a/cpp/include/raft/random/multi_variable_gaussian.cuh +++ b/cpp/include/raft/random/multi_variable_gaussian.cuh @@ -20,6 +20,7 @@ #pragma once #include "detail/multi_variable_gaussian.cuh" +#include namespace raft::random { @@ -59,24 +60,6 @@ class multi_variable_gaussian : public detail::multi_variable_gaussian_impl { ~multi_variable_gaussian() { deinit(); } }; // end of multi_variable_gaussian -/** - * @brief Matrix decomposition method for `compute_multi_variable_gaussian` to use. - * - * `compute_multi_variable_gaussian` can use any of the following methods. - * - * - `CHOLESKY`: Uses Cholesky decomposition on the normal equations. - * This may be faster than the other two methods, but less accurate. - * - * - `JACOBI`: Uses the singular value decomposition (SVD) computed with - * cuSOLVER's gesvdj algorithm, which is based on the Jacobi method - * (sweeps of plane rotations). This exposes more parallelism - * for small and medium size matrices than the QR option below. - * - * - `QR`: Uses the SVD computed with cuSOLVER's gesvd algorithm, - * which is based on the QR algortihm. 
- */ -using detail::multi_variable_gaussian_decomposition_method; - template void compute_multi_variable_gaussian( const raft::handle_t& handle, diff --git a/cpp/include/raft/random/random_types.hpp b/cpp/include/raft/random/random_types.hpp new file mode 100644 index 0000000000..96b55a4727 --- /dev/null +++ b/cpp/include/raft/random/random_types.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace raft::random { + +/** + * @brief Matrix decomposition method for `compute_multi_variable_gaussian` to use. + * + * `compute_multi_variable_gaussian` can use any of the following methods. + * + * - `CHOLESKY`: Uses Cholesky decomposition on the normal equations. + * This may be faster than the other two methods, but less accurate. + * + * - `JACOBI`: Uses the singular value decomposition (SVD) computed with + * cuSOLVER's gesvdj algorithm, which is based on the Jacobi method + * (sweeps of plane rotations). This exposes more parallelism + * for small and medium size matrices than the QR option below. + * + * - `QR`: Uses the SVD computed with cuSOLVER's gesvd algorithm, + * which is based on the QR algorithm. 
+ */ +enum class multi_variable_gaussian_decomposition_method { CHOLESKY, JACOBI, QR }; + +}; // end of namespace raft::random diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh index 8ea985b559..95bfe24a68 100644 --- a/cpp/include/raft/random/rng.cuh +++ b/cpp/include/raft/random/rng.cuh @@ -703,6 +703,30 @@ void laplace(const raft::handle_t& handle, detail::laplace(rng_state, ptr, len, mu, scale, handle.get_stream()); } +namespace sample_without_replacement_impl { +template +struct weight_alias { +}; + +template <> +struct weight_alias { + using type = double; +}; + +template +struct weight_alias>> { + using type = typename raft::device_vector_view::value_type; +}; + +template +using weight_t = typename weight_alias::type; +} // namespace sample_without_replacement_impl + +/** + * \defgroup sample_without_replacement Sampling without Replacement + * @{ + */ + /** * @brief Sample the input vector without replacement, optionally based on the * input weight vector for each element in the array. @@ -721,7 +745,10 @@ void laplace(const raft::handle_t& handle, * * @tparam DataT type of each element of the input array @c in * @tparam IdxT type of the dimensions of the arrays; output index type - * @tparam WeightsT type of each elements of the weights array @c wts + * @tparam WeightsVectorType std::optional> of + * each elements of the weights array @c weights_opt + * @tparam OutIndexVectorType std::optional> of output indices + * @c outIdx_opt * * @note Please do not specify template parameters explicitly, * as the compiler can deduce them from the arguments. @@ -730,10 +757,10 @@ void laplace(const raft::handle_t& handle, * the CUDA stream on which to run. * @param[inout] rng_state Pseudorandom number generator state. * @param[in] in Input vector to be sampled. - * @param[in] wts Optional weights vector. + * @param[in] weights_opt std::optional weights vector. * If not provided, uniform sampling will be used. 
* @param[out] out Vector of samples from the input vector. - * @param[out] outIdx If provided, vector of the indices + * @param[out] outIdx_opt std::optional vector of the indices * sampled from the input array. * * @pre The number of samples `out.extent(0)` @@ -742,14 +769,22 @@ void laplace(const raft::handle_t& handle, * @pre The number of weights `wts.extent(0)` * equals the number of inputs `in.extent(0)`. */ -template +template void sample_without_replacement(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view in, - std::optional> wts, + WeightsVectorType&& weights_opt, raft::device_vector_view out, - std::optional> outIdx) + OutIndexVectorType&& outIdx_opt) { + using weight_type = sample_without_replacement_impl::weight_t< + std::remove_const_t>>; + + std::optional> wts = + std::forward(weights_opt); + std::optional> outIdx = + std::forward(outIdx_opt); + static_assert(std::is_integral::value, "IdxT must be an integral type."); const IdxT sampledLen = out.extent(0); const IdxT len = in.extent(0); @@ -777,7 +812,7 @@ void sample_without_replacement(const raft::handle_t& handle, "sampleWithoutReplacement: " "If wts is provided, its extent(0) must equal in.extent(0)"); } - const WeightsT* wts_ptr = wts_has_value ? (*wts).data_handle() : nullptr; + const weight_type* wts_ptr = wts_has_value ? 
(*wts).data_handle() : nullptr; detail::sampleWithoutReplacement(rng_state, out.data_handle(), @@ -789,50 +824,22 @@ void sample_without_replacement(const raft::handle_t& handle, handle.get_stream()); } -namespace sample_without_replacement_impl { -template -struct weight_alias { -}; - -template <> -struct weight_alias { - using type = double; -}; - -template -struct weight_alias>> { - using type = typename raft::device_vector_view::value_type; -}; - -template -using weight_t = typename weight_alias::type; -} // namespace sample_without_replacement_impl - /** * @brief Overload of `sample_without_replacement` to help the * compiler find the above overload, in case users pass in * `std::nullopt` for one or both of the optional arguments. * + * * Please see above for documentation of `sample_without_replacement`. */ -template -void sample_without_replacement(const raft::handle_t& handle, - RngState& rng_state, - raft::device_vector_view in, - WeightsVectorType&& wts, - raft::device_vector_view out, - OutIndexVectorType&& outIdx) +template > +void sample_without_replacement(Args... args) { - using weight_type = sample_without_replacement_impl::weight_t< - std::remove_const_t>>; - std::optional> weights = - std::forward(wts); - std::optional> output_indices = - std::forward(outIdx); - - sample_without_replacement(handle, rng_state, in, weights, out, output_indices); + sample_without_replacement(std::forward(args)..., std::nullopt); } +/** @} */ + /** * @brief Legacy version of @c sample_without_replacement (see above) * that takes raw arrays instead of device mdspan. 
diff --git a/cpp/include/raft/sparse/detail/cusparse_wrappers.h b/cpp/include/raft/sparse/detail/cusparse_wrappers.h index c8e4229203..3bb2db7902 100644 --- a/cpp/include/raft/sparse/detail/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/detail/cusparse_wrappers.h @@ -19,6 +19,7 @@ #include #include #include +#include #include namespace raft { @@ -650,6 +651,73 @@ inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, * @defgroup Gemmi cusparse gemmi operations * @{ */ +#if CUDART_VERSION < 12000 +template +cusparseStatus_t cusparsegemmi( // NOLINT + cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const T* alpha, + const T* A, + int lda, + const T* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const T* beta, + T* C, + int ldc, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const float* alpha, + const float* A, + int lda, + const float* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + return cusparseSgemmi( + handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); +#pragma GCC diagnostic pop +} +template <> +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const double* alpha, + const double* A, + int lda, + const double* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + return cusparseDgemmi( + handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); +#pragma GCC diagnostic pop +} 
+#else // CUDART >= 12.0 template cusparseStatus_t cusparsegemmi( // NOLINT cusparseHandle_t handle, @@ -673,8 +741,9 @@ cusparseStatus_t cusparsegemmi( // NOLINT cusparseDnMatDescr_t matA; cusparseSpMatDescr_t matB; cusparseDnMatDescr_t matC; + rmm::device_uvector CT(m * n, stream); - auto math_type = std::is_same_v ? CUDA_R_32F : CUDA_R_64F; + auto constexpr math_type = std::is_same_v ? CUDA_R_32F : CUDA_R_64F; // Create sparse matrix B CUSPARSE_CHECK(cusparseCreateCsc(&matB, k, @@ -687,30 +756,38 @@ cusparseStatus_t cusparsegemmi( // NOLINT CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, math_type)); - // Create dense matrices + /** + * Create dense matrices. + * Note: Since this is replacing `cusparse_gemmi`, it assumes dense inputs are + * column-ordered + */ CUSPARSE_CHECK(cusparseCreateDnMat( - &matA, m, k, lda, static_cast(const_cast(A)), math_type, CUSPARSE_ORDER_ROW)); + &matA, m, k, lda, static_cast(const_cast(A)), math_type, CUSPARSE_ORDER_COL)); CUSPARSE_CHECK(cusparseCreateDnMat( - &matC, m, n, ldc, static_cast(const_cast(C)), math_type, CUSPARSE_ORDER_ROW)); + &matC, n, m, n, static_cast(CT.data()), math_type, CUSPARSE_ORDER_COL)); - cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE; - cusparseOperation_t opB = CUSPARSE_OPERATION_TRANSPOSE; - cusparseSpMMAlg_t alg = CUSPARSE_SPMM_CSR_ALG2; - size_t buffer_size = 0; + auto opA = CUSPARSE_OPERATION_TRANSPOSE; + auto opB = CUSPARSE_OPERATION_TRANSPOSE; + auto alg = CUSPARSE_SPMM_CSR_ALG1; + auto buffer_size = std::size_t{}; CUSPARSE_CHECK(cusparsespmm_bufferSize( - handle, opA, opB, alpha, matB, matA, beta, matC, alg, &buffer_size, stream)); + handle, opB, opA, alpha, matB, matA, beta, matC, alg, &buffer_size, stream)); buffer_size = buffer_size / sizeof(T); rmm::device_uvector external_buffer(buffer_size, stream); - auto return_value = cusparsespmm( - handle, opA, opB, alpha, matB, matA, beta, matC, alg, external_buffer.data(), stream); + auto ext_buf = 
static_cast(static_cast(external_buffer.data())); + auto return_value = + cusparsespmm(handle, opB, opA, alpha, matB, matA, beta, matC, alg, ext_buf, stream); + raft::handle_t rhandle; + raft::linalg::transpose(rhandle, CT.data(), C, n, m, stream); // destroy matrix/vector descriptors CUSPARSE_CHECK(cusparseDestroyDnMat(matA)); CUSPARSE_CHECK(cusparseDestroySpMat(matB)); CUSPARSE_CHECK(cusparseDestroyDnMat(matC)); return return_value; } +#endif /** @} */ /** diff --git a/cpp/include/raft/sparse/linalg/transpose.cuh b/cpp/include/raft/sparse/linalg/transpose.cuh index fa0031aab6..ae527fe34c 100644 --- a/cpp/include/raft/sparse/linalg/transpose.cuh +++ b/cpp/include/raft/sparse/linalg/transpose.cuh @@ -13,8 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef __TRANSPOSE_H -#define __TRANSPOSE_H #pragma once @@ -69,6 +67,4 @@ void csr_transpose(const raft::handle_t& handle, }; // end NAMESPACE linalg }; // end NAMESPACE sparse -}; // end NAMESPACE raft - -#endif \ No newline at end of file +}; // end NAMESPACE raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/op/detail/reduce.cuh b/cpp/include/raft/sparse/op/detail/reduce.cuh index b4d8cb7db9..2b2566f107 100644 --- a/cpp/include/raft/sparse/op/detail/reduce.cuh +++ b/cpp/include/raft/sparse/op/detail/reduce.cuh @@ -86,7 +86,7 @@ __global__ void max_duplicates_kernel(const value_idx* src_rows, * * Note that this function always marks the first value as 0 so that * a cumulative sum can be performed as a follow-on. However, even - * if the mask is used direclty, any duplicates should always have a + * if the mask is used directly, any duplicates should always have a * 1 when first encountered so it can be assumed that the first element * is always a 1 otherwise. 
* diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh index cd67e124ee..80b479f98d 100644 --- a/cpp/include/raft/sparse/op/reduce.cuh +++ b/cpp/include/raft/sparse/op/reduce.cuh @@ -34,7 +34,7 @@ namespace op { * * Note that this function always marks the first value as 0 so that * a cumulative sum can be performed as a follow-on. However, even - * if the mask is used direclty, any duplicates should always have a + * if the mask is used directly, any duplicates should always have a * 1 when first encountered so it can be assumed that the first element * is always a 1 otherwise. * diff --git a/cpp/include/raft/sparse/solver/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/solver/detail/mst_solver_inl.cuh index be8b696bca..d68d9f68b0 100644 --- a/cpp/include/raft/sparse/solver/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/solver/detail/mst_solver_inl.cuh @@ -220,7 +220,7 @@ void MST_solver::alteration() auto nthreads = std::min(v, max_threads); auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); - // maximum alteration that does not change realtive weights order + // maximum alteration that does not change relative weights order alteration_t max = alteration_max(); // pool of rand values diff --git a/cpp/include/raft/sparse/solver/mst.cuh b/cpp/include/raft/sparse/solver/mst.cuh index 5f55a567ca..a941ce7c80 100644 --- a/cpp/include/raft/sparse/solver/mst.cuh +++ b/cpp/include/raft/sparse/solver/mst.cuh @@ -21,7 +21,7 @@ namespace raft::sparse::solver { /** - * Compute the minimium spanning tree (MST) or minimum spanning forest (MSF) depending on + * Compute the minimum spanning tree (MST) or minimum spanning forest (MSF) depending on * the connected components of the given graph. 
* * @tparam vertex_t integral type for precision of vertex indexing diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index b766e12cbd..961cc76381 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -464,7 +465,7 @@ __global__ void __launch_bounds__((WarpSize * BlockDimY)) // a sample from the selected larger cluster. const IdxT li = static_cast(labels[i]); // Weight of the current center for the weighted average. - // We dump it for anomalously small clusters, but keep constant overwise. + // We dump it for anomalously small clusters, but keep constant otherwise. const float wc = csize > kAdjustCentersWeight ? kAdjustCentersWeight : float(csize); // Weight for the datapoint used to shift the center. const float wd = 1.0; @@ -568,7 +569,7 @@ auto adjust_centers(float* centers, // a sample from the selected larger cluster. const IdxT li = static_cast(labels[i]); // Weight of the current center for the weighted average. - // We dump it for anomalously small clusters, but keep constant overwise. + // We dump it for anomalously small clusters, but keep constant otherwise. const float wc = std::min(csize, kAdjustCentersWeight); // Weight for the datapoint used to shift the center. const float wd = 1.0; @@ -663,8 +664,16 @@ void balancing_em_iters(const handle_t& handle, // To avoid converging to zero, we normalize the center vectors on every iteration. 
case raft::distance::DistanceType::InnerProduct: case raft::distance::DistanceType::CosineExpanded: - case raft::distance::DistanceType::CorrelationExpanded: - utils::normalize_rows(n_clusters, dim, cluster_centers, stream); + case raft::distance::DistanceType::CorrelationExpanded: { + auto clusters_in_view = + raft::make_device_matrix_view( + cluster_centers, n_clusters, dim); + auto clusters_out_view = raft::make_device_matrix_view( + cluster_centers, n_clusters, dim); + raft::linalg::row_normalize( + handle, clusters_in_view, clusters_out_view, raft::linalg::L2Norm); + break; + } default: break; } // E: Expectation step - predict labels diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index 5d031cc51d..b721915187 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -54,8 +54,9 @@ struct pointer_residency_count { cudaPointerAttributes attr; RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr)); switch (attr.type) { - case cudaMemoryTypeUnregistered: - case cudaMemoryTypeHost: return std::make_tuple(on_device, on_host + 1); + case cudaMemoryTypeUnregistered: return std::make_tuple(on_device, on_host + 1); + case cudaMemoryTypeHost: + return std::make_tuple(on_device + int(attr.devicePointer == ptr), on_host + 1); case cudaMemoryTypeDevice: return std::make_tuple(on_device + 1, on_host); case cudaMemoryTypeManaged: return std::make_tuple(on_device + 1, on_host + 1); default: return std::make_tuple(on_device, on_host); @@ -75,6 +76,58 @@ auto check_pointer_residency(const Types*... ptrs) -> pointer_residency return pointer_residency::mixed; } +/** RAII helper to access the host data from gpu when necessary. 
*/ +template +struct with_mapped_memory_t { + with_mapped_memory_t(PtrT ptr, size_t size, Action action) : action_(action) + { + if (ptr == nullptr) { return; } + switch (utils::check_pointer_residency(ptr)) { + case utils::pointer_residency::device_only: + case utils::pointer_residency::host_and_device: { + dev_ptr_ = (void*)ptr; // NOLINT + } break; + default: { + host_ptr_ = (void*)ptr; // NOLINT + RAFT_CUDA_TRY(cudaHostRegister(host_ptr_, size, choose_flags(ptr))); + RAFT_CUDA_TRY(cudaHostGetDevicePointer(&dev_ptr_, host_ptr_, 0)); + } break; + } + } + + ~with_mapped_memory_t() + { + if (host_ptr_ != nullptr) { cudaHostUnregister(host_ptr_); } + } + + auto operator()() { return action_((PtrT)dev_ptr_); } // NOLINT + + private: + Action action_; + void* host_ptr_ = nullptr; + void* dev_ptr_ = nullptr; + + template + static auto choose_flags(const T*) -> unsigned int + { + int dev_id, readonly_supported; + RAFT_CUDA_TRY(cudaGetDevice(&dev_id)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute( + &readonly_supported, cudaDevAttrHostRegisterReadOnlySupported, dev_id)); + if (readonly_supported) { + return cudaHostRegisterMapped | cudaHostRegisterReadOnly; + } else { + return cudaHostRegisterMapped; + } + } + + template + static auto choose_flags(T*) -> unsigned int + { + return cudaHostRegisterMapped; + } +}; + template struct config { }; @@ -151,97 +204,6 @@ inline void memzero(T* ptr, IdxT n_elems, rmm::cuda_stream_view stream) } } -template -__global__ void dots_along_rows_kernel(IdxT n_rows, IdxT n_cols, const float* a, float* out) -{ - IdxT i = threadIdx.y + (blockDim.y * static_cast(blockIdx.x)); - if (i >= n_rows) return; - - float sqsum = 0.0; - for (IdxT j = threadIdx.x; j < n_cols; j += blockDim.x) { - float val = a[j + (n_cols * i)]; - sqsum += val * val; - } - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8); - sqsum 
+= __shfl_xor_sync(0xffffffff, sqsum, 16); - if (threadIdx.x == 0) { out[i] = sqsum; } -} - -/** - * @brief Square sum of values in each row (row-major matrix). - * - * NB: device-only function - * - * @tparam IdxT index type - * - * @param n_rows - * @param n_cols - * @param[in] a device pointer to the row-major matrix [n_rows, n_cols] - * @param[out] out device pointer to the vector of dot-products [n_rows] - * @param stream - */ -template -inline void dots_along_rows( - IdxT n_rows, IdxT n_cols, const float* a, float* out, rmm::cuda_stream_view stream) -{ - dim3 threads(32, 4, 1); - dim3 blocks(ceildiv(n_rows, threads.y), 1, 1); - dots_along_rows_kernel<<>>(n_rows, n_cols, a, out); - /** - * TODO: this can be replaced with the rowNorm helper as shown below. - * However, the rowNorm helper seems to incur a significant performance penalty - * (example case ann-search slowed down from 150ms to 186ms). - * - * raft::linalg::rowNorm(out, a, n_cols, n_rows, raft::linalg::L2Norm, true, stream); - */ -} - -template -__global__ void normalize_rows_kernel(IdxT n_rows, IdxT n_cols, float* a) -{ - IdxT i = threadIdx.y + (blockDim.y * static_cast(blockIdx.x)); - if (i >= n_rows) return; - - float sqsum = 0.0; - for (IdxT j = threadIdx.x; j < n_cols; j += blockDim.x) { - float val = a[j + (n_cols * i)]; - sqsum += val * val; - } - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8); - sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16); - if (sqsum <= 1e-8) return; - sqsum = rsqrtf(sqsum); // reciprocal of the square root - for (IdxT j = threadIdx.x; j < n_cols; j += blockDim.x) { - a[j + n_cols * i] *= sqsum; - } -} - -/** - * @brief Divide rows by their L2 norm (square root of sum of squares). 
- * - * NB: device-only function - * - * @tparam IdxT index type - * - * @param[in] n_rows - * @param[in] n_cols - * @param[inout] a device pointer to a row-major matrix [n_rows, n_cols] - * @param stream - */ -template -inline void normalize_rows(IdxT n_rows, IdxT n_cols, float* a, rmm::cuda_stream_view stream) -{ - dim3 threads(32, 4, 1); // DO NOT CHANGE - dim3 blocks(ceildiv(n_rows, threads.y), 1, 1); - normalize_rows_kernel<<>>(n_rows, n_cols, a); -} - template __global__ void outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c) { diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh index 94897daa22..32a8f0ed33 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh @@ -18,7 +18,7 @@ #include -#include "../ball_cover_common.h" +#include "../ball_cover_types.hpp" #include "ball_cover/common.cuh" #include "ball_cover/registers.cuh" #include "block_select_faiss.cuh" diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh index 112ab9f13c..9c5307e683 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh @@ -18,7 +18,7 @@ #include "common.cuh" -#include "../../ball_cover_common.h" +#include "../../ball_cover_types.hpp" #include "../block_select_faiss.cuh" #include "../haversine_distance.cuh" #include "../selection_faiss.cuh" @@ -791,4 +791,4 @@ void rbc_low_dim_pass_two(const raft::handle_t& handle, }; // namespace detail }; // namespace knn }; // namespace spatial -}; // namespace raft \ No newline at end of file +}; // namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh index e16efe4a69..19862d743d 100644 --- 
a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh +++ b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh @@ -16,8 +16,8 @@ #pragma once -#include #include +#include namespace raft { namespace spatial { diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh index b5ae9e7d5e..5c03f8f67c 100644 --- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh +++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh @@ -27,7 +27,6 @@ #include #include -#include namespace raft { namespace spatial { diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh index 14f5ae4516..14c4dd85f1 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh @@ -24,6 +24,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -47,11 +51,13 @@ using namespace raft::spatial::knn::detail; // NOLINT * @tparam T element type. * @tparam IdxT type of the indices in the source source_vecs * @tparam LabelT label type + * @tparam gather_src if false, then we build the index from vectors source_vecs[i,:], otherwise + * we use source_vecs[source_ixs[i],:]. In both cases i=0..n_rows-1. * * @param[in] labels device pointer to the cluster ids for each row [n_rows] * @param[in] list_offsets device pointer to the cluster offsets in the output (index) [n_lists] - * @param[in] source_vecs device poitner to the input data [n_rows, dim] - * @param[in] source_ixs device poitner to the input indices [n_rows] + * @param[in] source_vecs device pointer to the input data [n_rows, dim] + * @param[in] source_ixs device pointer to the input indices [n_rows] * @param[out] list_data device pointer to the output [index_size, dim] * @param[out] list_index device pointer to the source ids corr. 
to the output [index_size] * @param[out] list_sizes_ptr device pointer to the cluster sizes [n_lists]; @@ -61,7 +67,7 @@ using namespace raft::spatial::knn::detail; // NOLINT * @param veclen size of vectorized loads/stores; must satisfy `dim % veclen == 0`. * */ -template +template __global__ void build_index_kernel(const LabelT* labels, const IdxT* list_offsets, const T* source_vecs, @@ -92,8 +98,11 @@ __global__ void build_index_kernel(const LabelT* labels, list_data += (list_offset + group_offset) * dim; // Point to the source vector - source_vecs += i * dim; - + if constexpr (gather_src) { + source_vecs += source_ixs[i] * dim; + } else { + source_vecs += i * dim; + } // Interleave dimensions of the source vector while recording it. // NB: such `veclen` is selected, that `dim % veclen == 0` for (uint32_t l = 0; l < dim; l += veclen) { @@ -133,7 +142,8 @@ inline auto extend(const handle_t& handle, orig_index.metric(), stream); - index ext_index(handle, orig_index.metric(), n_lists, dim); + index ext_index( + handle, orig_index.metric(), n_lists, orig_index.adaptive_centers(), dim); auto list_sizes_ptr = ext_index.list_sizes().data_handle(); auto list_offsets_ptr = ext_index.list_offsets().data_handle(); @@ -141,19 +151,31 @@ inline auto extend(const handle_t& handle, // Calculate the centers and sizes on the new data, starting from the original values raft::copy(centers_ptr, orig_index.centers().data_handle(), ext_index.centers().size(), stream); - raft::copy( - list_sizes_ptr, orig_index.list_sizes().data_handle(), ext_index.list_sizes().size(), stream); - kmeans::calc_centers_and_sizes(handle, - centers_ptr, - list_sizes_ptr, - n_lists, - dim, - new_vectors, - n_rows, - new_labels.data(), - false, - stream); + if (ext_index.adaptive_centers()) { + raft::copy( + list_sizes_ptr, orig_index.list_sizes().data_handle(), ext_index.list_sizes().size(), stream); + kmeans::calc_centers_and_sizes(handle, + centers_ptr, + list_sizes_ptr, + n_lists, + dim, + new_vectors, + 
n_rows, + new_labels.data(), + false, + stream); + } else { + raft::stats::histogram(raft::stats::HistTypeAuto, + reinterpret_cast(list_sizes_ptr), + IdxT(n_lists), + new_labels.data(), + n_rows, + 1, + stream); + raft::linalg::add( + list_sizes_ptr, list_sizes_ptr, orig_index.list_sizes().data_handle(), n_lists, stream); + } // Calculate new offsets IdxT index_size = 0; @@ -210,13 +232,22 @@ inline auto extend(const handle_t& handle, // Precompute the centers vector norms for L2Expanded distance if (ext_index.center_norms().has_value()) { - // todo(lsugy): use other prim and remove this one - utils::dots_along_rows(n_lists, - dim, - ext_index.centers().data_handle(), - ext_index.center_norms()->data_handle(), - stream); - RAFT_LOG_TRACE_VEC(ext_index.center_norms()->data_handle(), std::min(dim, 20)); + if (!ext_index.adaptive_centers() && orig_index.center_norms().has_value()) { + raft::copy(ext_index.center_norms()->data_handle(), + orig_index.center_norms()->data_handle(), + orig_index.center_norms()->size(), + stream); + } else { + raft::linalg::rowNorm(ext_index.center_norms()->data_handle(), + ext_index.centers().data_handle(), + dim, + n_lists, + raft::linalg::L2Norm, + true, + stream, + raft::SqrtOp()); + RAFT_LOG_TRACE_VEC(ext_index.center_norms()->data_handle(), std::min(dim, 20)); + } } // assemble the index @@ -274,4 +305,76 @@ inline auto build( } } +/** + * Build an index that can be used in refinement operation. + * + * See raft::neighbors::refine for details on the refinement operation. + * + * The returned index cannot be used for a regular ivf_flat::search. The index misses information + * about coarse clusters. Instead, the neighbor candidates are assumed to form clusters, one for + * each query. The candidate vectors are gathered into the index dataset, that can be later used + * in ivfflat_interleaved_scan. 
+ * + * @param[in] handle the raft handle + * @param[inout] refinement_index + * @param[in] dataset device pointer to dataset vectors, size [n_rows, dim]. Note that n_rows is + * not known to this function, but each candidate_idx has to be smaller than n_rows. + * @param[in] candidate_idx device pointer to neighbor candidates, size [n_queries, n_candidates] + * @param[in] n_candidates of neighbor_candidates + */ +template +inline void fill_refinement_index(const handle_t& handle, + index* refinement_index, + const T* dataset, + const IdxT* candidate_idx, + IdxT n_queries, + uint32_t n_candidates) +{ + using LabelT = uint32_t; + + auto stream = handle.get_stream(); + uint32_t n_lists = n_queries; + common::nvtx::range fun_scope( + "ivf_flat::fill_refinement_index(%zu, %u)", size_t(n_queries)); + + rmm::device_uvector new_labels(n_queries * n_candidates, stream); + linalg::writeOnlyUnaryOp( + new_labels.data(), + n_queries * n_candidates, + [n_candidates] __device__(LabelT * out, uint32_t i) { *out = i / n_candidates; }, + stream); + + auto list_sizes_ptr = refinement_index->list_sizes().data_handle(); + auto list_offsets_ptr = refinement_index->list_offsets().data_handle(); + // We do not fill centers and center norms, since we will not run coarse search. 
+ + // Calculate new offsets + uint32_t n_roundup = Pow2::roundUp(n_candidates); + linalg::writeOnlyUnaryOp( + refinement_index->list_offsets().data_handle(), + refinement_index->list_offsets().size(), + [n_roundup] __device__(IdxT * out, uint32_t i) { *out = i * n_roundup; }, + stream); + + IdxT index_size = n_roundup * n_lists; + refinement_index->allocate( + handle, index_size, refinement_index->metric() == raft::distance::DistanceType::L2Expanded); + + RAFT_CUDA_TRY(cudaMemsetAsync(list_sizes_ptr, 0, n_lists * sizeof(uint32_t), stream)); + + const dim3 block_dim(256); + const dim3 grid_dim(raft::ceildiv(n_queries * n_candidates, block_dim.x)); + build_index_kernel + <<>>(new_labels.data(), + list_offsets_ptr, + dataset, + candidate_idx, + refinement_index->data().data_handle(), + refinement_index->indices().data_handle(), + list_sizes_ptr, + n_queries * n_candidates, + refinement_index->dim(), + refinement_index->veclen()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} } // namespace raft::spatial::knn::ivf_flat::detail diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh index 5b3b2129f7..94f4dc96c6 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_search.cuh @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -980,7 +981,7 @@ struct select_interleaved_scan_kernel { capacity, veclen, select_min, std::forward(args)...); } } - // NB: this is the limitation of the topk::block_topk stuctures that use a huge number of + // NB: this is the limitation of the topk::block_topk structures that use a huge number of // registers (used in the main kernel here). 
RAFT_EXPECTS(capacity == Capacity, "Capacity must be power-of-two not bigger than the maximum allowed size " @@ -1102,8 +1103,14 @@ void search_impl(const handle_t& handle, if (index.metric() == raft::distance::DistanceType::L2Expanded) { alpha = -2.0f; beta = 1.0f; - utils::dots_along_rows( - n_queries, index.dim(), converted_queries_ptr, query_norm_dev.data(), stream); + raft::linalg::rowNorm(query_norm_dev.data(), + converted_queries_ptr, + static_cast(index.dim()), + static_cast(n_queries), + raft::linalg::L2Norm, + true, + stream, + raft::SqrtOp()); utils::outer_add(query_norm_dev.data(), (IdxT)n_queries, index.center_norms()->data_handle(), @@ -1205,6 +1212,26 @@ void search_impl(const handle_t& handle, } } +/** + * Whether minimal distance corresponds to similar elements (using the given metric). + */ +inline bool is_min_close(distance::DistanceType metric) +{ + bool select_min; + switch (metric) { + case raft::distance::DistanceType::InnerProduct: + case raft::distance::DistanceType::CosineExpanded: + case raft::distance::DistanceType::CorrelationExpanded: + // Similarity metrics have the opposite meaning, i.e. nearest neighbors are those with larger + // similarity (See the same logic at cpp/include/raft/sparse/spatial/detail/knn.cuh:362 + // {perform_k_selection}) + select_min = false; + break; + default: select_min = true; + } + return select_min; +} + /** See raft::spatial::knn::ivf_flat::search docs */ template inline void search(const handle_t& handle, @@ -1224,27 +1251,22 @@ inline void search(const handle_t& handle, "n_probes (number of clusters to probe in the search) must be positive."); auto n_probes = std::min(params.n_probes, index.n_lists()); - bool select_min; - switch (index.metric()) { - case raft::distance::DistanceType::InnerProduct: - case raft::distance::DistanceType::CosineExpanded: - case raft::distance::DistanceType::CorrelationExpanded: - // Similarity metrics have the opposite meaning, i.e. 
nearest neigbours are those with larger - // similarity (See the same logic at cpp/include/raft/sparse/spatial/detail/knn.cuh:362 - // {perform_k_selection}) - select_min = false; - break; - default: select_min = true; - } - auto pool_guard = raft::get_pool_memory_resource(mr, n_queries * n_probes * k * 16); if (pool_guard) { RAFT_LOG_DEBUG("ivf_flat::search: using pool memory resource with initial size %zu bytes", pool_guard->pool_size()); } - return search_impl( - handle, index, queries, n_queries, k, n_probes, select_min, neighbors, distances, mr); + return search_impl(handle, + index, + queries, + n_queries, + k, + n_probes, + is_min_close(index.metric()), + neighbors, + distances, + mr); } } // namespace raft::spatial::knn::ivf_flat::detail diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh index 0577d24349..9262ef6baf 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_build.cuh @@ -16,17 +16,20 @@ #pragma once -#include "../ivf_pq_types.hpp" #include "ann_kmeans_balanced.cuh" #include "ann_utils.cuh" +#include + #include #include #include #include #include +#include #include #include +#include #include #include #include @@ -50,6 +53,14 @@ namespace raft::spatial::knn::ivf_pq::detail { using namespace raft::spatial::knn::detail; // NOLINT +using raft::neighbors::ivf_pq::codebook_gen; +using raft::neighbors::ivf_pq::index; +using raft::neighbors::ivf_pq::index_params; +using raft::neighbors::ivf_pq::kIndexGroupSize; +using raft::neighbors::ivf_pq::kIndexGroupVecLen; + +using pq_codes_exts = extents; + namespace { /** @@ -108,55 +119,72 @@ struct bitfield_view_t { NB: label type is uint32_t although it can only contain values up to `1 << pq_bits`. We keep it this way to not force one more overload for kmeans::predict. 
*/ -template -HDI void ivfpq_encode_core(uint32_t n_rows, uint32_t pq_dim, const uint32_t* label, uint8_t* output) +template +__device__ void ivfpq_encode_core(uint32_t n_rows, + uint32_t pq_dim, + const uint32_t* label, + uint8_t* output) { - bitfield_view_t out{output}; - for (uint32_t j = 0; j < pq_dim; j++, label += n_rows) { - out[j] = static_cast(*label); + constexpr uint32_t kChunkSize = (VecLen * 8u) / PqBits; + TxN_t vec; + for (uint32_t j = 0; j < pq_dim;) { + vec.fill(0); + bitfield_view_t out{vec.val.data}; +#pragma unroll + for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++, label += n_rows) { + out[k] = static_cast(*label); + } + vec.store(output, 0); + output += VecLen; } } template __launch_bounds__(BlockDim) __global__ - void ivfpq_encode_kernel(uint32_t n_rows, - uint32_t pq_dim, + void ivfpq_encode_kernel(uint32_t pq_dim, const uint32_t* label, // [pq_dim, n_rows] - uint8_t* output // [n_rows, pq_dim] + device_mdspan output // [n_rows, ..] ) { uint32_t i = threadIdx.x + BlockDim * blockIdx.x; - if (i >= n_rows) return; - ivfpq_encode_core(n_rows, pq_dim, label + i, output + (pq_dim * PqBits / 8) * i); + if (i >= output.extent(0)) return; + ivfpq_encode_core( + output.extent(0), + pq_dim, + label + i, + output.data_handle() + output.extent(1) * output.extent(2) * i); } } // namespace -inline void ivfpq_encode(uint32_t n_rows, - uint32_t pq_dim, +/** + * Compress the cluster labels into an encoding with pq_bits bits, and transform it into a form to + * facilitate vectorized loads + */ +inline void ivfpq_encode(uint32_t pq_dim, uint32_t pq_bits, // 4 <= pq_bits <= 8 const uint32_t* label, // [pq_dim, n_rows] - uint8_t* output, // [n_rows, pq_dim] + device_mdspan output, // [n_rows, ..] 
rmm::cuda_stream_view stream) { constexpr uint32_t kBlockDim = 128; dim3 threads(kBlockDim, 1, 1); - dim3 blocks(raft::ceildiv(n_rows, kBlockDim), 1, 1); + dim3 blocks(raft::ceildiv(output.extent(0), kBlockDim), 1, 1); switch (pq_bits) { case 4: return ivfpq_encode_kernel - <<>>(n_rows, pq_dim, label, output); + <<>>(pq_dim, label, output); case 5: return ivfpq_encode_kernel - <<>>(n_rows, pq_dim, label, output); + <<>>(pq_dim, label, output); case 6: return ivfpq_encode_kernel - <<>>(n_rows, pq_dim, label, output); + <<>>(pq_dim, label, output); case 7: return ivfpq_encode_kernel - <<>>(n_rows, pq_dim, label, output); + <<>>(pq_dim, label, output); case 8: return ivfpq_encode_kernel - <<>>(n_rows, pq_dim, label, output); + <<>>(pq_dim, label, output); default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits); } } @@ -226,7 +254,7 @@ void select_residuals(const handle_t& handle, ) { auto stream = handle.get_stream(); - rmm::device_uvector tmp(n_rows * dim, stream, device_memory); + rmm::device_uvector tmp(size_t(n_rows) * size_t(dim), stream, device_memory); utils::copy_selected( n_rows, (IdxT)dim, dataset, row_ids, (IdxT)dim, tmp.data(), (IdxT)dim, stream); @@ -260,6 +288,7 @@ void select_residuals(const handle_t& handle, } /** + * * @param handle, * @param n_rows * @param data_dim @@ -278,30 +307,35 @@ void select_residuals(const handle_t& handle, * it should be partitioned by the clusters by now. * @param cluster_sizes // [n_clusters] * @param cluster_offsets // [n_clusters + 1] - * @param pq_centers // [...] - * @param pq_dataset // [n_rows, pq_dim * pq_bits / 8] + * @param pq_centers // [...] 
(see ivf_pq::index::pq_centers() layout) + * @param pq_dataset + * // [n_rows, ceildiv(pq_dim, (kIndexGroupVecLen * 8u) / pq_bits), kIndexGroupVecLen] + * NB: in contrast to the final interleaved layout in ivf_pq::index::pq_dataset(), this function + * produces a non-interleaved data; it gets interleaved later when adding the data to the + * index. * @param device_memory */ template -void compute_pq_codes(const handle_t& handle, - IdxT n_rows, - uint32_t data_dim, - uint32_t rot_dim, - uint32_t pq_dim, - uint32_t pq_len, - uint32_t pq_bits, - uint32_t n_clusters, - codebook_gen codebook_kind, - uint32_t max_cluster_size, - float* cluster_centers, - const float* rotation_matrix, - const T* dataset, - const IdxT* data_indices, - const uint32_t* cluster_sizes, - const IdxT* cluster_offsets, - const float* pq_centers, - uint8_t* pq_dataset, - rmm::mr::device_memory_resource* device_memory) +void compute_pq_codes( + const handle_t& handle, + IdxT n_rows, + uint32_t data_dim, + uint32_t rot_dim, + uint32_t pq_dim, + uint32_t pq_len, + uint32_t pq_bits, + uint32_t n_clusters, + codebook_gen codebook_kind, + uint32_t max_cluster_size, + float* cluster_centers, + const float* rotation_matrix, + const T* dataset, + const IdxT* data_indices, + const uint32_t* cluster_sizes, + const IdxT* cluster_offsets, + device_mdspan::pq_centers_extents, row_major> pq_centers, + device_mdspan pq_dataset, + rmm::mr::device_memory_resource* device_memory) { common::nvtx::range fun_scope( "ivf_pq::compute_pq_codes(n_rows = %zu, data_dim = %u, rot_dim = %u (%u * %u), n_clusters = " @@ -317,15 +351,15 @@ void compute_pq_codes(const handle_t& handle, // // Compute PQ code // - utils::memzero(pq_dataset, n_rows * pq_dim * pq_bits / 8, stream); - rmm::device_uvector rot_vectors(max_cluster_size * rot_dim, stream, device_memory); - rmm::device_uvector sub_vectors(max_cluster_size * pq_dim * pq_len, stream, device_memory); - rmm::device_uvector sub_vector_labels(max_cluster_size * pq_dim, stream, 
device_memory); - rmm::device_uvector my_pq_dataset( - max_cluster_size * pq_dim * pq_bits / 8 /* NB: pq_dim * bitPQ % 8 == 0 */, - stream, - device_memory); + uint32_t pq_width = 1 << pq_bits; + rmm::device_uvector pq_centers_tmp(pq_len * pq_width, stream, device_memory); + rmm::device_uvector rot_vectors( + size_t(max_cluster_size) * size_t(rot_dim), stream, device_memory); + rmm::device_uvector sub_vectors( + size_t(max_cluster_size) * size_t(pq_dim * pq_len), stream, device_memory); + rmm::device_uvector sub_vector_labels( + size_t(max_cluster_size) * size_t(pq_dim), stream, device_memory); for (uint32_t l = 0; l < n_clusters; l++) { auto cluster_size = cluster_sizes[l]; @@ -339,7 +373,7 @@ void compute_pq_codes(const handle_t& handle, data_dim, rot_dim, rotation_matrix, - cluster_centers + uint64_t(l) * data_dim, + cluster_centers + size_t(l) * size_t(data_dim), dataset, data_indices + cluster_offsets[l], device_memory); @@ -351,37 +385,51 @@ void compute_pq_codes(const handle_t& handle, // output: sub_vectors[pq_dim, cluster_size, pq_len] // for (uint32_t i = 0; i < pq_dim; i++) { - RAFT_CUDA_TRY(cudaMemcpy2DAsync(sub_vectors.data() + i * pq_len * cluster_size, - sizeof(float) * pq_len, - rot_vectors.data() + i * pq_len, - sizeof(float) * rot_dim, - sizeof(float) * pq_len, - cluster_size, - cudaMemcpyDefault, - stream)); + RAFT_CUDA_TRY( + cudaMemcpy2DAsync(sub_vectors.data() + size_t(i) * size_t(pq_len) * size_t(cluster_size), + sizeof(float) * pq_len, + rot_vectors.data() + i * pq_len, + sizeof(float) * rot_dim, + sizeof(float) * pq_len, + cluster_size, + cudaMemcpyDefault, + stream)); + } + + if (codebook_kind == codebook_gen::PER_CLUSTER) { + linalg::writeOnlyUnaryOp( + pq_centers_tmp.data(), + pq_len * pq_width, + [pq_centers, pq_width, pq_len, l] __device__(float* out, uint32_t i) { + auto i0 = i / pq_len; + auto i1 = i % pq_len; + *out = pq_centers(l, i1, i0); + }, + stream); } // // Find a label (cluster ID) for each vector subspace. 
// for (uint32_t j = 0; j < pq_dim; j++) { - const float* sub_pq_centers = nullptr; - switch (codebook_kind) { - case codebook_gen::PER_SUBSPACE: - sub_pq_centers = pq_centers + ((1 << pq_bits) * pq_len) * j; - break; - case codebook_gen::PER_CLUSTER: - sub_pq_centers = pq_centers + ((1 << pq_bits) * pq_len) * l; - break; - default: RAFT_FAIL("Unreachable code"); + if (codebook_kind == codebook_gen::PER_SUBSPACE) { + linalg::writeOnlyUnaryOp( + pq_centers_tmp.data(), + pq_len * pq_width, + [pq_centers, pq_width, pq_len, j] __device__(float* out, uint32_t i) { + auto i0 = i / pq_len; + auto i1 = i % pq_len; + *out = pq_centers(j, i1, i0); + }, + stream); } kmeans::predict(handle, - sub_pq_centers, - (1 << pq_bits), + pq_centers_tmp.data(), + pq_width, pq_len, - sub_vectors.data() + j * (cluster_size * pq_len), + sub_vectors.data() + size_t(j) * size_t(cluster_size) * size_t(pq_len), cluster_size, - sub_vector_labels.data() + j * cluster_size, + sub_vector_labels.data() + size_t(j) * size_t(cluster_size), raft::distance::DistanceType::L2Expanded, stream, device_memory); @@ -391,11 +439,14 @@ void compute_pq_codes(const handle_t& handle, // PQ encoding // ivfpq_encode( - cluster_size, pq_dim, pq_bits, sub_vector_labels.data(), my_pq_dataset.data(), stream); - copy(pq_dataset + cluster_offsets[l] * uint64_t{pq_dim * pq_bits / 8}, - my_pq_dataset.data(), - cluster_size * pq_dim * pq_bits / 8, - stream); + pq_dim, + pq_bits, + sub_vector_labels.data(), + make_mdspan( + pq_dataset.data_handle() + + size_t(cluster_offsets[l]) * pq_dataset.extent(1) * pq_dataset.extent(2), + make_extents(cluster_size, pq_dataset.extent(1), pq_dataset.static_extent(2))), + stream); } } @@ -405,7 +456,7 @@ __launch_bounds__(BlockDim) __global__ void fill_indices_kernel(IdxT n_rows, IdxT* data_offsets, const uint32_t* labels) { - const auto i = BlockDim * IdxT(blockIdx.x) + IdxT(threadIdx.x); + const auto i = IdxT(BlockDim) * IdxT(blockIdx.x) + IdxT(threadIdx.x); if (i >= n_rows) { return; } 
data_indices[atomicAdd(data_offsets + labels[i], 1)] = i; } @@ -453,10 +504,36 @@ auto calculate_offsets_and_indices(IdxT n_rows, return max_cluster_size; } +template +void transpose_pq_centers(index& index, + const float* pq_centers_source, + rmm::cuda_stream_view stream) +{ + auto extents = index.pq_centers().extents(); + static_assert(extents.rank() == 3); + auto extents_source = + make_extents(extents.extent(0), extents.extent(2), extents.extent(1)); + auto span_source = + make_mdspan(pq_centers_source, extents_source); + linalg::writeOnlyUnaryOp( + index.pq_centers().data_handle(), + index.pq_centers().size(), + [span_source, extents] __device__(float* out, size_t i) { + uint32_t ii[3]; + for (int r = 2; r > 0; r--) { + ii[r] = i % extents.extent(r); + i /= extents.extent(r); + } + ii[0] = i; + *out = span_source(ii[0], ii[2], ii[1]); + }, + stream); +} + template void train_per_subset(const handle_t& handle, index& index, - IdxT n_rows, + size_t n_rows, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, @@ -465,7 +542,8 @@ void train_per_subset(const handle_t& handle, { auto stream = handle.get_stream(); - rmm::device_uvector sub_trainset(n_rows * index.pq_len(), stream, device_memory); + rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); + rmm::device_uvector sub_trainset(n_rows * size_t(index.pq_len()), stream, device_memory); rmm::device_uvector sub_labels(n_rows, stream, device_memory); rmm::device_uvector pq_cluster_sizes(index.pq_book_size(), stream, device_memory); @@ -476,14 +554,15 @@ void train_per_subset(const handle_t& handle, // Get the rotated cluster centers for each training vector. // This will be subtracted from the input vectors afterwards. 
- utils::copy_selected(n_rows, - (IdxT)index.pq_len(), - index.centers_rot().data_handle() + index.pq_len() * j, - labels, - (IdxT)index.rot_dim(), - sub_trainset.data(), - (IdxT)index.pq_len(), - stream); + utils::copy_selected( + n_rows, + index.pq_len(), + index.centers_rot().data_handle() + index.pq_len() * j, + labels, + index.rot_dim(), + sub_trainset.data(), + index.pq_len(), + stream); // sub_trainset is the slice of: rotate(trainset) - centers_rot float alpha = 1.0; @@ -505,26 +584,26 @@ void train_per_subset(const handle_t& handle, stream); // train PQ codebook for this subspace - kmeans::build_clusters( - handle, - kmeans_n_iters, - index.pq_len(), - sub_trainset.data(), - n_rows, - index.pq_book_size(), - index.pq_centers().data_handle() + (index.pq_book_size() * index.pq_len()) * j, - sub_labels.data(), - pq_cluster_sizes.data(), - raft::distance::DistanceType::L2Expanded, - stream, - device_memory); + kmeans::build_clusters(handle, + kmeans_n_iters, + index.pq_len(), + sub_trainset.data(), + n_rows, + index.pq_book_size(), + pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j, + sub_labels.data(), + pq_cluster_sizes.data(), + raft::distance::DistanceType::L2Expanded, + stream, + device_memory); } + transpose_pq_centers(index, pq_centers_tmp.data(), stream); } template void train_per_cluster(const handle_t& handle, index& index, - IdxT n_rows, + size_t n_rows, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, @@ -532,26 +611,30 @@ void train_per_cluster(const handle_t& handle, rmm::mr::device_memory_resource* device_memory) { auto stream = handle.get_stream(); + + rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); rmm::device_uvector cluster_sizes(index.n_lists(), stream, managed_memory); rmm::device_uvector indices_buf(n_rows, stream, device_memory); rmm::device_uvector offsets_buf(index.list_offsets().size(), stream, managed_memory); - 
raft::stats::histogram(raft::stats::HistTypeAuto, - reinterpret_cast(cluster_sizes.data()), - IdxT(index.n_lists()), - labels, - n_rows, - 1, - stream); + raft::stats::histogram(raft::stats::HistTypeAuto, + reinterpret_cast(cluster_sizes.data()), + index.n_lists(), + labels, + n_rows, + 1, + stream); auto cluster_offsets = offsets_buf.data(); auto indices = indices_buf.data(); uint32_t max_cluster_size = calculate_offsets_and_indices( - n_rows, index.n_lists(), labels, cluster_sizes.data(), cluster_offsets, indices, stream); + IdxT(n_rows), index.n_lists(), labels, cluster_sizes.data(), cluster_offsets, indices, stream); - rmm::device_uvector pq_labels(max_cluster_size * index.pq_dim(), stream, device_memory); + rmm::device_uvector pq_labels( + size_t(max_cluster_size) * size_t(index.pq_dim()), stream, device_memory); rmm::device_uvector pq_cluster_sizes(index.pq_book_size(), stream, device_memory); - rmm::device_uvector rot_vectors(max_cluster_size * index.rot_dim(), stream, device_memory); + rmm::device_uvector rot_vectors( + size_t(max_cluster_size) * size_t(index.rot_dim()), stream, device_memory); handle.sync_stream(); // make sure cluster offsets are up-to-date for (uint32_t l = 0; l < index.n_lists(); l++) { @@ -566,15 +649,15 @@ void train_per_cluster(const handle_t& handle, index.dim(), index.rot_dim(), index.rotation_matrix().data_handle(), - index.centers().data_handle() + uint64_t(l) * index.dim_ext(), + index.centers().data_handle() + size_t(l) * size_t(index.dim_ext()), trainset, indices + cluster_offsets[l], device_memory); // limit the cluster size to bound the training time. 
// [sic] we interpret the data as pq_len-dimensional - size_t big_enough = 256 * std::max(index.pq_book_size(), index.pq_dim()); - size_t available_rows = cluster_size * index.pq_dim(); + size_t big_enough = 256ul * std::max(index.pq_book_size(), index.pq_dim()); + size_t available_rows = size_t(cluster_size) * size_t(index.pq_dim()); auto pq_n_rows = uint32_t(std::min(big_enough, available_rows)); // train PQ codebook for this cluster kmeans::build_clusters( @@ -584,22 +667,27 @@ void train_per_cluster(const handle_t& handle, rot_vectors.data(), pq_n_rows, index.pq_book_size(), - index.pq_centers().data_handle() + index.pq_book_size() * index.pq_len() * l, + pq_centers_tmp.data() + size_t(index.pq_book_size()) * size_t(index.pq_len()) * size_t(l), pq_labels.data(), pq_cluster_sizes.data(), raft::distance::DistanceType::L2Expanded, stream, device_memory); } + transpose_pq_centers(index, pq_centers_tmp.data(), stream); } -/** See raft::spatial::knn::ivf_pq::extend docs */ +/** + * See raft::spatial::knn::ivf_pq::extend docs. + * + * This version requires `new_vectors` and `new_indices` (if non-null) to be on-device. + */ template -inline auto extend(const handle_t& handle, - const index& orig_index, - const T* new_vectors, - const IdxT* new_indices, - IdxT n_rows) -> index +inline auto extend_device(const handle_t& handle, + const index& orig_index, + const T* new_vectors, + const IdxT* new_indices, + IdxT n_rows) -> index { common::nvtx::range fun_scope( "ivf_pq::extend(%zu, %u)", size_t(n_rows), orig_index.dim()); @@ -611,6 +699,13 @@ inline auto extend(const handle_t& handle, static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "Unsupported data type"); + switch (new_indices != nullptr ? 
utils::check_pointer_residency(new_vectors, new_indices) + : utils::check_pointer_residency(new_vectors)) { + case utils::pointer_residency::device_only: + case utils::pointer_residency::host_and_device: break; + default: RAFT_FAIL("[ivf_pq::extend_device] The added data must be available on device."); + } + rmm::mr::device_memory_resource* device_memory = nullptr; auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); if (pool_guard) { @@ -629,7 +724,8 @@ inline auto extend(const handle_t& handle, // const auto n_clusters = orig_index.n_lists(); - rmm::device_uvector cluster_centers(n_clusters * orig_index.dim(), stream, device_memory); + rmm::device_uvector cluster_centers( + size_t(n_clusters) * size_t(orig_index.dim()), stream, device_memory); RAFT_CUDA_TRY(cudaMemcpy2DAsync(cluster_centers.data(), sizeof(float) * orig_index.dim(), orig_index.centers().data_handle(), @@ -683,8 +779,9 @@ inline auto extend(const handle_t& handle, // // Compute PQ code for new vectors // - rmm::device_uvector new_pq_codes( - n_rows * orig_index.pq_dim() * orig_index.pq_bits() / 8, stream, device_memory); + pq_codes_exts new_pq_exts = make_extents( + n_rows, orig_index.pq_dataset().extent(1), orig_index.pq_dataset().static_extent(3)); + auto new_pq_codes = make_device_mdarray(handle, device_memory, new_pq_exts); compute_pq_codes(handle, n_rows, orig_index.dim(), @@ -701,8 +798,8 @@ inline auto extend(const handle_t& handle, new_data_indices.data(), new_cluster_sizes, new_cluster_offsets.data(), - orig_index.pq_centers().data_handle(), - new_pq_codes.data(), + orig_index.pq_centers(), + new_pq_codes.view(), device_memory); // Get the combined cluster sizes and sort the clusters in decreasing order @@ -711,31 +808,27 @@ inline auto extend(const handle_t& handle, rmm::device_uvector ext_cluster_sizes_buf(n_clusters, stream, &managed_memory); rmm::device_uvector old_cluster_offsets_buf(n_clusters + 1, stream, &managed_memory); rmm::device_uvector 
ext_cluster_offsets_buf(n_clusters + 1, stream, &managed_memory); - rmm::device_uvector cluster_ordering(n_clusters, stream, &managed_memory); + rmm::device_uvector cluster_ordering_buf(n_clusters, stream, &managed_memory); auto old_cluster_sizes = old_cluster_sizes_buf.data(); auto ext_cluster_sizes = ext_cluster_sizes_buf.data(); auto old_cluster_offsets = old_cluster_offsets_buf.data(); auto ext_cluster_offsets = ext_cluster_offsets_buf.data(); + auto cluster_ordering = cluster_ordering_buf.data(); copy(old_cluster_offsets, orig_index.list_offsets().data_handle(), orig_index.list_offsets().size(), stream); + copy(old_cluster_sizes, + orig_index.list_sizes().data_handle(), + orig_index.list_sizes().size(), + stream); uint32_t n_nonempty_lists = 0; { rmm::device_uvector ext_cluster_sizes_buf_in(n_clusters, stream, device_memory); rmm::device_uvector cluster_ordering_in(n_clusters, stream, device_memory); auto ext_cluster_sizes_in = ext_cluster_sizes_buf_in.data(); - linalg::writeOnlyUnaryOp( - old_cluster_sizes, - n_clusters, - [ext_cluster_sizes_in, new_cluster_sizes, old_cluster_offsets] __device__(uint32_t * out, - size_t i) { - auto old_size = old_cluster_offsets[i + 1] - old_cluster_offsets[i]; - ext_cluster_sizes_in[i] = old_size + new_cluster_sizes[i]; - *out = old_size; - }, - stream); + linalg::add(ext_cluster_sizes_in, old_cluster_sizes, new_cluster_sizes, n_clusters, stream); thrust::sequence(handle.get_thrust_policy(), cluster_ordering_in.data(), @@ -749,7 +842,7 @@ inline auto extend(const handle_t& handle, ext_cluster_sizes_in, ext_cluster_sizes, cluster_ordering_in.data(), - cluster_ordering.data(), + cluster_ordering, n_clusters, begin_bit, end_bit, @@ -760,7 +853,7 @@ inline auto extend(const handle_t& handle, ext_cluster_sizes_in, ext_cluster_sizes, cluster_ordering_in.data(), - cluster_ordering.data(), + cluster_ordering, n_clusters, begin_bit, end_bit, @@ -775,43 +868,49 @@ inline auto extend(const handle_t& handle, } // Assemble the extended 
index - ivf_pq::index ext_index(handle, - orig_index.metric(), - orig_index.codebook_kind(), - n_clusters, - orig_index.dim(), - orig_index.pq_bits(), - orig_index.pq_dim(), - n_nonempty_lists); - ext_index.allocate(handle, orig_index.size() + n_rows); - - // Copy the unchanged parts - copy(ext_index.rotation_matrix().data_handle(), - orig_index.rotation_matrix().data_handle(), - orig_index.rotation_matrix().size(), - stream); - + index ext_index(handle, + orig_index.metric(), + orig_index.codebook_kind(), + n_clusters, + orig_index.dim(), + orig_index.pq_bits(), + orig_index.pq_dim(), + n_nonempty_lists); // calculate extended cluster offsets - auto ext_indices = ext_index.indices().data_handle(); { - IdxT zero = 0; - update_device(ext_cluster_offsets, &zero, 1, stream); - thrust::inclusive_scan(handle.get_thrust_policy(), - ext_cluster_sizes, - ext_cluster_sizes + n_clusters, - ext_cluster_offsets + 1, - [] __device__(IdxT s, uint32_t l) { return s + l; }); + using group_align = Pow2; + IdxT size = 0; + update_device(ext_cluster_offsets, &size, 1, stream); + thrust::inclusive_scan( + handle.get_thrust_policy(), + ext_cluster_sizes, + ext_cluster_sizes + n_clusters, + ext_cluster_offsets + 1, + [] __device__(IdxT a, IdxT b) { return group_align::roundUp(a) + group_align::roundUp(b); }); + update_host(&size, ext_cluster_offsets + n_clusters, 1, stream); + handle.sync_stream(); copy(ext_index.list_offsets().data_handle(), ext_cluster_offsets, ext_index.list_offsets().size(), stream); + copy(ext_index.list_sizes().data_handle(), + ext_cluster_sizes, + ext_index.list_sizes().size(), + stream); + ext_index.allocate(handle, size); } + // Copy the unchanged parts + copy(ext_index.rotation_matrix().data_handle(), + orig_index.rotation_matrix().data_handle(), + orig_index.rotation_matrix().size(), + stream); + // copy cluster-ordering-dependent data utils::copy_selected(n_clusters, ext_index.dim_ext(), orig_index.centers().data_handle(), - cluster_ordering.data(), + 
cluster_ordering, orig_index.dim_ext(), ext_index.centers().data_handle(), ext_index.dim_ext(), @@ -819,7 +918,7 @@ inline auto extend(const handle_t& handle, utils::copy_selected(n_clusters, ext_index.rot_dim(), orig_index.centers_rot().data_handle(), - cluster_ordering.data(), + cluster_ordering, orig_index.rot_dim(), ext_index.centers_rot().data_handle(), ext_index.rot_dim(), @@ -836,7 +935,7 @@ inline auto extend(const handle_t& handle, utils::copy_selected(n_clusters, d, orig_index.pq_centers().data_handle(), - cluster_ordering.data(), + cluster_ordering, d, ext_index.pq_centers().data_handle(), d, @@ -847,8 +946,9 @@ inline auto extend(const handle_t& handle, // Make ext_indices handle.sync_stream(); // make sure cluster sizes are up-to-date + auto ext_indices = ext_index.indices().data_handle(); for (uint32_t l = 0; l < ext_index.n_lists(); l++) { - auto k = cluster_ordering.data()[l]; + auto k = cluster_ordering[l]; auto old_cluster_size = old_cluster_sizes[k]; auto new_cluster_size = new_cluster_sizes[k]; if (old_cluster_size > 0) { @@ -878,27 +978,91 @@ inline auto extend(const handle_t& handle, } /* Extend the pq_dataset */ - auto ext_pq_dataset = ext_index.pq_dataset().data_handle(); - size_t pq_dataset_unit = ext_index.pq_dim() * ext_index.pq_bits() / 8; + // For simplicity and performance, we reinterpret the last dimension of the dataset + // as a single vector element. + using vec_t = TxN_t::io_t; + + auto data_unit = ext_index.pq_dataset().extent(1); + auto ext_pq_dataset = make_mdspan( + reinterpret_cast(ext_index.pq_dataset().data_handle()), + make_extents( + ext_index.pq_dataset().extent(0), data_unit, ext_index.pq_dataset().extent(2))); + for (uint32_t l = 0; l < ext_index.n_lists(); l++) { - auto k = cluster_ordering.data()[l]; + // Extend the data cluster-by-cluster; + // The original/old index stores the data interleaved; + // the new data produced by `compute_pq_codes` is not interleaved. 
+ auto k = cluster_ordering[l]; auto old_cluster_size = old_cluster_sizes[k]; - copy(ext_pq_dataset + pq_dataset_unit * ext_cluster_offsets[l], - orig_index.pq_dataset().data_handle() + pq_dataset_unit * old_cluster_offsets[k], - pq_dataset_unit * old_cluster_size, - stream); - copy(ext_pq_dataset + pq_dataset_unit * (ext_cluster_offsets[l] + old_cluster_size), - new_pq_codes.data() + pq_dataset_unit * new_cluster_offsets.data()[k], - pq_dataset_unit * new_cluster_sizes[k], - stream); + auto old_pq_dataset = make_mdspan( + reinterpret_cast(orig_index.pq_dataset().data_handle()) + + data_unit * old_cluster_offsets[k], + make_extents(div_rounding_up_safe(old_cluster_size, kIndexGroupSize), + data_unit, + ext_pq_dataset.extent(2))); + auto new_pq_data = make_mdspan( + reinterpret_cast(new_pq_codes.data_handle()) + + data_unit * new_cluster_offsets.data()[k], + make_extents(new_cluster_sizes[k], data_unit)); + // Write all cluster data, vec-by-vec + linalg::writeOnlyUnaryOp( + ext_pq_dataset.data_handle() + data_unit * ext_cluster_offsets[l], + data_unit * size_t(ext_cluster_offsets[l + 1] - ext_cluster_offsets[l]), + [old_pq_dataset, new_pq_data, old_cluster_size] __device__(vec_t * out, size_t i_flat) { + // find the proper 3D index from the flat offset + size_t i[3]; + for (int r = 2; r > 0; r--) { + i[r] = i_flat % old_pq_dataset.extent(r); + i_flat /= old_pq_dataset.extent(r); + } + i[0] = i_flat; + auto row_ix = i[0] * old_pq_dataset.extent(2) + i[2]; + if (row_ix < old_cluster_size) { + // First, pack the original/old data + *out = old_pq_dataset(i[0], i[1], i[2]); + } else { + // Then add the new data + row_ix -= old_cluster_size; + if (row_ix < new_pq_data.extent(0)) { + *out = new_pq_data(row_ix, i[1]); + } else { + *out = vec_t{}; + } + } + }, + stream); } return ext_index; } -/** See raft::spatial::knn::ivf_pq::build docs */ +/** See raft::spatial::knn::ivf_pq::extend docs */ template -inline auto build( +inline auto extend(const handle_t& handle, + const 
index& orig_index, + const T* new_vectors, + const IdxT* new_indices, + IdxT n_rows) -> index +{ + size_t vec_size = sizeof(T) * size_t(n_rows) * size_t(orig_index.dim()); + size_t ind_size = sizeof(IdxT) * size_t(n_rows); + return utils::with_mapped_memory_t{ + new_vectors, vec_size, [&](const T* new_vectors_dev) { + return utils::with_mapped_memory_t{ + new_indices, ind_size, [&](const IdxT* new_indices_dev) { + return extend_device( + handle, orig_index, new_vectors_dev, new_indices_dev, n_rows); + }}(); + }}(); +} + +/** + * See raft::spatial::knn::ivf_pq::build docs. + * + * This version requires `dataset` to be on-device. + */ +template +inline auto build_device( const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim) -> index { @@ -909,14 +1073,22 @@ inline auto build( RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset"); + switch (utils::check_pointer_residency(dataset)) { + case utils::pointer_residency::device_only: + case utils::pointer_residency::host_and_device: break; + default: RAFT_FAIL("[ivf_pq::build_device] The dataset pointer must be available on device."); + } + auto stream = handle.get_stream(); - ivf_pq::index index(handle, params, dim); + index index(handle, params, dim); utils::memzero(index.list_offsets().data_handle(), index.list_offsets().size(), stream); + utils::memzero(index.list_sizes().data_handle(), index.list_sizes().size(), stream); - auto trainset_ratio = std::max( - 1, n_rows / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); - auto n_rows_train = n_rows / trainset_ratio; + auto trainset_ratio = std::max( + 1, + size_t(n_rows) / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); + size_t n_rows_train = n_rows / trainset_ratio; rmm::mr::device_memory_resource* device_memory = nullptr; auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); @@ -929,9 +1101,21 @@ inline auto build( rmm::mr::pool_memory_resource managed_memory( 
&managed_memory_upstream, 1024 * 1024); + // If the trainset is small enough to comfortably fit into device memory, put it there. + // Otherwise, use the managed memory. + rmm::mr::device_memory_resource* big_memory_resource = &managed_memory; + { + size_t free_mem, total_mem; + constexpr size_t kTolerableRatio = 4; + RAFT_CUDA_TRY(cudaMemGetInfo(&free_mem, &total_mem)); + if (sizeof(float) * n_rows_train * index.dim() * kTolerableRatio < free_mem) { + big_memory_resource = device_memory; + } + } + // Besides just sampling, we transform the input dataset into floats to make it easier // to use gemm operations from cublas. - rmm::device_uvector trainset(n_rows_train * index.dim(), stream, device_memory); + rmm::device_uvector trainset(n_rows_train * index.dim(), stream, big_memory_resource); // TODO: a proper sampling if constexpr (std::is_same_v) { RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), @@ -946,10 +1130,10 @@ inline auto build( auto dim = index.dim(); linalg::writeOnlyUnaryOp( trainset.data(), - index.dim() * n_rows_train, + size_t(index.dim()) * n_rows_train, [dataset, trainset_ratio, dim] __device__(float* out, size_t i) { auto col = i % dim; - *out = utils::mapping{}(dataset[(i - col) * trainset_ratio + col]); + *out = utils::mapping{}(dataset[(i - col) * size_t(trainset_ratio) + col]); }, stream); } @@ -971,7 +1155,7 @@ inline auto build( stream); // Trainset labels are needed for training PQ codebooks - rmm::device_uvector labels(n_rows_train, stream, device_memory); + rmm::device_uvector labels(n_rows_train, stream, big_memory_resource); kmeans::predict(handle, cluster_centers, index.n_lists(), @@ -995,8 +1179,14 @@ inline auto build( stream)); rmm::device_uvector center_norms(index.n_lists(), stream, device_memory); - utils::dots_along_rows( - index.n_lists(), index.dim(), cluster_centers, center_norms.data(), stream); + raft::linalg::rowNorm(center_norms.data(), + cluster_centers, + index.dim(), + index.n_lists(), + raft::linalg::L2Norm, + true, + 
stream, + raft::SqrtOp()); RAFT_CUDA_TRY(cudaMemcpy2DAsync(index.centers().data_handle() + index.dim(), sizeof(float) * index.dim_ext(), center_norms.data(), @@ -1060,10 +1250,23 @@ inline auto build( // add the data if necessary if (params.add_data_on_build) { - return detail::extend(handle, index, dataset, nullptr, n_rows); + return detail::extend_device(handle, index, dataset, nullptr, n_rows); } else { return index; } } +/** See raft::spatial::knn::ivf_pq::build docs */ +template +inline auto build( + const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim) + -> index +{ + size_t data_size = sizeof(T) * size_t(n_rows) * size_t(dim); + return utils::with_mapped_memory_t{dataset, data_size, [&](const T* dataset_dev) { + return build_device( + handle, params, dataset_dev, n_rows, dim); + }}(); +} + } // namespace raft::spatial::knn::ivf_pq::detail diff --git a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh index b1f47a6c52..c1a3682f47 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_pq_search.cuh @@ -16,11 +16,12 @@ #pragma once -#include "../ivf_pq_types.hpp" #include "ann_utils.cuh" #include "topk.cuh" #include "topk/warpsort_topk.cuh" +#include + #include #include #include @@ -38,6 +39,7 @@ #include #include +#include #include #include @@ -56,6 +58,12 @@ static_assert((kMaxCapacity >= 32) && !(kMaxCapacity & (kMaxCapacity - 1)), using namespace raft::spatial::knn::detail; // NOLINT +using raft::neighbors::ivf_pq::codebook_gen; +using raft::neighbors::ivf_pq::index; +using raft::neighbors::ivf_pq::kIndexGroupSize; +using raft::neighbors::ivf_pq::kIndexGroupVecLen; +using raft::neighbors::ivf_pq::search_params; + /** 8-bit floating-point storage type. * * This is a custom type for the current IVF-PQ implementation. 
No arithmetic operations defined @@ -78,6 +86,7 @@ struct fp_8bit { return *this; } HDI explicit operator float() const { return fp_8bit2float(*this); } + HDI explicit operator half() const { return half(fp_8bit2float(*this)); } private: static constexpr float kMin = 1.0f / float(1u << ExpMask); @@ -226,10 +235,10 @@ void select_clusters(const handle_t& handle, * in chunk_indices. Essentially this is a segmented inclusive scan of the cluster sizes. The total * number of samples per query (sum of the cluster sizes that we probe) is returned in n_samples. */ -template +template __launch_bounds__(BlockDim) __global__ void calc_chunk_indices_kernel(uint32_t n_probes, - const IdxT* cluster_offsets, // [n_clusters + 1] + const uint32_t* cluster_sizes, // [n_clusters] const uint32_t* clusters_to_probe, // [n_queries, n_probes] uint32_t* chunk_indices, // [n_queries, n_probes] uint32_t* n_samples // [n_queries] @@ -247,9 +256,7 @@ __launch_bounds__(BlockDim) __global__ uint32_t total = 0; for (uint32_t probe_ix = threadIdx.x; probe_ix < n_probes_aligned; probe_ix += BlockDim) { auto label = probe_ix < n_probes ? clusters_to_probe[probe_ix] : 0u; - auto chunk = probe_ix < n_probes - ? static_cast(cluster_offsets[label + 1] - cluster_offsets[label]) - : 0u; + auto chunk = probe_ix < n_probes ? 
cluster_sizes[label] : 0u; if (threadIdx.x == 0) { chunk += total; } block_scan(shm).InclusiveSum(chunk, chunk, total); __syncthreads(); @@ -259,7 +266,6 @@ __launch_bounds__(BlockDim) __global__ if (threadIdx.x == 0) { n_samples[blockIdx.x] = total; } } -template struct calc_chunk_indices { public: struct configured { @@ -268,19 +274,19 @@ struct calc_chunk_indices { dim3 grid_dim; uint32_t n_probes; - void operator()(const IdxT* cluster_offsets, - const uint32_t* clusters_to_probe, - uint32_t* chunk_indices, - uint32_t* n_samples, - rmm::cuda_stream_view stream) + inline void operator()(const uint32_t* cluster_sizes, + const uint32_t* clusters_to_probe, + uint32_t* chunk_indices, + uint32_t* n_samples, + rmm::cuda_stream_view stream) { void* args[] = // NOLINT - {&n_probes, &cluster_offsets, &clusters_to_probe, &chunk_indices, &n_samples}; + {&n_probes, &cluster_sizes, &clusters_to_probe, &chunk_indices, &n_samples}; RAFT_CUDA_TRY(cudaLaunchKernel(kernel, grid_dim, block_dim, args, 0, stream)); } }; - static auto configure(uint32_t n_probes, uint32_t n_queries) -> configured + static inline auto configure(uint32_t n_probes, uint32_t n_queries) -> configured { return try_block_dim<1024>(n_probes, n_queries); } @@ -292,7 +298,7 @@ struct calc_chunk_indices { if constexpr (BlockDim >= WarpSize * 2) { if (BlockDim >= n_probes * 2) { return try_block_dim<(BlockDim / 2)>(n_probes, n_queries); } } - return {reinterpret_cast(calc_chunk_indices_kernel), + return {reinterpret_cast(calc_chunk_indices_kernel), dim3(BlockDim, 1, 1), dim3(n_queries, 1, 1), n_probes}; @@ -321,7 +327,7 @@ __device__ auto find_db_row(IdxT& x, // NOLINT uint32_t ix_max = n_probes; do { uint32_t i = (ix_min + ix_max) / 2; - if (IdxT(chunk_indices[i]) < x) { + if (IdxT(chunk_indices[i]) <= x) { ix_min = i + 1; } else { ix_max = i; @@ -359,7 +365,7 @@ __launch_bounds__(BlockDim) __global__ clusters_to_probe + n_probes * query_ix, chunk_indices + n_probes * query_ix); } - neighbors[k] = valid ? 
db_indices[data_ix] : std::numeric_limits::max(); + neighbors[k] = valid ? db_indices[data_ix] : index::kOutOfBoundsRecord; } /** @@ -367,7 +373,7 @@ __launch_bounds__(BlockDim) __global__ * (as stored in index.indices()). * * When the main kernel runs with a fused top-k (`manage_local_topk == true`), this function simply - * fetches the index values by the returned row ids. Otherwise, the found neighors require extra + * fetches the index values by the returned row ids. Otherwise, the found neighbors require extra * pre-processing (performed by `find_db_row`). */ template @@ -444,68 +450,16 @@ void postprocess_distances(float* out, // [n_queries, topk] } } -/** - * @brief Compute the similarity score between a vector from `pq_dataset` and a query vector. - * - * @tparam OpT an unsigned integer type that is used for bit operations on multiple PQ codes - * at once; it's selected to maximize throughput while matching criteria: - * 1. `pq_bits * vec_len % 8 * sizeof(OpT) == 0`. - * 2. `pq_dim % vec_len == 0` - * - * @tparam LutT type of the elements in the lookup table. - * - * @param pq_bits The bit length of an encoded vector element after compression by PQ - * @param vec_len == 8 * sizeof(OpT) / gcd(8 * sizeof(OpT), pq_bits) - * @param pq_dim - * @param[in] pq_code_ptr - * a device pointer to the dataset at the indexed position (`pq_dim * pq_bits` bits-wide) - * @param[in] lut_scores - * a device or shared memory pointer to the lookup table [pq_dim, pq_book_size] - * - * @return the score for the entry `data_ix` in the `pq_dataset`. 
- */ -template -__device__ auto ivfpq_compute_score( - uint32_t pq_bits, uint32_t vec_len, uint32_t pq_dim, const OpT* pq_head, const LutT* lut_scores) - -> float -{ - float score = 0.0; - constexpr uint32_t kBitsTotal = 8 * sizeof(OpT); - for (; pq_dim > 0; pq_dim -= vec_len) { - OpT pq_code = pq_head[0]; - pq_head++; - auto bits_left = kBitsTotal; - for (uint32_t k = 0; k < vec_len; k++) { - uint8_t code = pq_code; - if (bits_left > pq_bits) { - pq_code >>= pq_bits; - bits_left -= pq_bits; - } else { - if (k < vec_len - 1) { - pq_code = pq_head[0]; - pq_head++; - } - code |= (pq_code << bits_left); - pq_code >>= (pq_bits - bits_left); - bits_left += (kBitsTotal - pq_bits); - } - code &= (1 << pq_bits) - 1; - score += float(lut_scores[code]); - lut_scores += (1 << pq_bits); - } - } - return score; -} - template struct dummy_block_sort_t { - using queue_t = topk::warp_sort_immediate; - __device__ dummy_block_sort_t(int k, uint8_t* smem_buf){}; + using queue_t = topk::warp_sort_distributed; + template + __device__ dummy_block_sort_t(int k, uint8_t* smem_buf, Args...){}; }; template struct pq_block_sort { - using type = topk::block_sort; + using type = topk::block_sort; }; template @@ -516,6 +470,82 @@ struct pq_block_sort<0, T, IdxT> : dummy_block_sort_t { template using block_sort_t = typename pq_block_sort::type; +/* Manually unrolled loop over a chunk of pq_dataset that fits into one VecT. 
*/ +template +__device__ __forceinline__ void ivfpq_compute_chunk(OutT& score /* NOLINT */, + typename VecT::math_t& pq_code, + const VecT& pq_codes, + const LutT*& lut_head, + const LutT*& lut_end) +{ + if constexpr (CheckBounds) { + if (lut_head >= lut_end) { return; } + } + constexpr uint32_t kTotalBits = 8 * sizeof(typename VecT::math_t); + constexpr uint32_t kPqShift = 1u << PqBits; + constexpr uint32_t kPqMask = kPqShift - 1u; + if constexpr (BitsLeft >= PqBits) { + uint8_t code = pq_code & kPqMask; + pq_code >>= PqBits; + score += OutT(lut_head[code]); + lut_head += kPqShift; + return ivfpq_compute_chunk( + score, pq_code, pq_codes, lut_head, lut_end); + } else if constexpr (Ix < VecT::Ratio) { + uint8_t code = pq_code; + pq_code = pq_codes.val.data[Ix]; + constexpr uint32_t kRemBits = PqBits - BitsLeft; + constexpr uint32_t kRemMask = (1u << kRemBits) - 1u; + code |= (pq_code & kRemMask) << BitsLeft; + pq_code >>= kRemBits; + score += OutT(lut_head[code]); + lut_head += kPqShift; + return ivfpq_compute_chunk(score, pq_code, pq_codes, lut_head, lut_end); + } +} + +/* Compute the similarity for one vector in the pq_dataset */ +template +__device__ auto ivfpq_compute_score(uint32_t pq_dim, + const typename VecT::io_t* pq_head, + const LutT* lut_scores, + OutT early_stop_limit) -> OutT +{ + constexpr uint32_t kChunkSize = sizeof(VecT) * 8u / PqBits; + auto lut_head = lut_scores; + auto lut_end = lut_scores + (pq_dim << PqBits); + VecT pq_codes; + OutT score{0}; + for (; pq_dim >= kChunkSize; pq_dim -= kChunkSize) { + *pq_codes.vectorized_data() = *pq_head; + pq_head += kIndexGroupSize; + typename VecT::math_t pq_code = 0; + ivfpq_compute_chunk( + score, pq_code, pq_codes, lut_head, lut_end); + // Early stop when it makes sense (otherwise early_stop_limit is kDummy/infinity). 
+ if (score >= early_stop_limit) { return score; } + } + if (pq_dim > 0) { + *pq_codes.vectorized_data() = *pq_head; + typename VecT::math_t pq_code = 0; + ivfpq_compute_chunk( + score, pq_code, pq_codes, lut_head, lut_end); + } + return score; +} + /** * The main kernel that computes similarity scores across multiple queries and probes. * When `Capacity > 0`, it also selects top K candidates for each query and probe @@ -524,14 +554,15 @@ using block_sort_t = typename pq_block_sort::type; * Each block processes a (query, probe) pair: it calculates the distance between the single query * vector and all the dataset vector in the cluster that we are probing. * - * @tparam OpT is a carrier integer type selected to maximize throughput; - * Used solely in `ivfpq_compute_score`; * @tparam IdxT * The type of data indices * @tparam OutT * The output type - distances. * @tparam LutT * The lookup table element type (lut_scores). + * @tparam PqBits + * The bit length of an encoded vector element after compression by PQ + * (NB: pq_book_size = 1 << PqBits). * @tparam Capacity * Power-of-two; the maximum possible `k` in top-k. Value zero disables fused top-k search. * @tparam PrecompBaseDiff @@ -546,8 +577,6 @@ using block_sort_t = typename pq_block_sort::type; * @param n_rows the number of records in the dataset * @param dim the dimensionality of the data (NB: after rotation transform, i.e. `index.rot_dim()`). * @param n_probes the number of clusters to search for each query - * @param pq_bits the bit length of an encoded vector element after compression by PQ - * (NB: pq_book_size = 1 << pq_bits). * @param pq_dim * The dimensionality of an encoded vector after compression by PQ. * @param n_queries the number of queries. @@ -561,7 +590,7 @@ using block_sort_t = typename pq_block_sort::type; * The device pointer to the cluster centers in the PQ space * [pq_dim, pq_book_size, pq_len] or [n_clusters, pq_book_size, pq_len,]. 
* @param pq_dataset - * The device pointer to the PQ index (data) [n_rows, pq_dim * pq_bits / 8]. + * The device pointer to the PQ index (data) [n_rows, ...]. * @param cluster_offsets * The device pointer to the cluster offsets [n_clusters + 1]. * @param cluster_labels @@ -574,7 +603,7 @@ using block_sort_t = typename pq_block_sort::type; * An optional device pointer to the enforced order of search [n_queries, n_probes]. * One can pass reordered indices here to try to improve data reading locality. * @param lut_scores - * The device pointer for storing the lookup table globally [gridDim.x, pq_dim << pq_bits]. + * The device pointer for storing the lookup table globally [gridDim.x, pq_dim << PqBits]. * Ignored when `EnableSMemLut == true`. * @param _out_scores * The device pointer to the output scores @@ -583,75 +612,80 @@ using block_sort_t = typename pq_block_sort::type; * The device pointer to the output indices [n_queries, n_probes, topk]. * Ignored when `Capacity == 0`. */ -template -__launch_bounds__(1024) __global__ - void ivfpq_compute_similarity_kernel(uint32_t n_rows, - uint32_t dim, - uint32_t n_probes, - uint32_t pq_bits, - uint32_t pq_dim, - uint32_t n_queries, - distance::DistanceType metric, - codebook_gen codebook_kind, - uint32_t topk, - const float* cluster_centers, - const float* pq_centers, - const uint8_t* pq_dataset, - const IdxT* cluster_offsets, - const uint32_t* cluster_labels, - const uint32_t* _chunk_indices, - const float* queries, - const uint32_t* index_list, - LutT* lut_scores, - OutT* _out_scores, - IdxT* _out_indices) +__global__ void ivfpq_compute_similarity_kernel(uint32_t n_rows, + uint32_t dim, + uint32_t n_probes, + uint32_t pq_dim, + uint32_t n_queries, + distance::DistanceType metric, + codebook_gen codebook_kind, + uint32_t topk, + const float* cluster_centers, + const float* pq_centers, + const uint8_t* pq_dataset, + const IdxT* cluster_offsets, + const uint32_t* cluster_labels, + const uint32_t* _chunk_indices, + const 
float* queries, + const uint32_t* index_list, + float* query_kths, + LutT* lut_scores, + OutT* _out_scores, + IdxT* _out_indices) { /* Shared memory: - * lut_scores: lookup table (LUT) of size = `pq_dim << pq_bits` (when EnableSMemLut) - * base_diff: size = dim (which is equal to `pq_dim * pq_len`) + * lut_scores: lookup table (LUT) of size = `pq_dim << PqBits` (when EnableSMemLut) + * base_diff: size = dim (which is equal to `pq_dim * pq_len`) or dim*2 * topk::block_sort: some amount of shared memory, but overlaps with the rest: block_sort only needs shared memory for `.done()` operation, which can come very last. */ extern __shared__ __align__(256) uint8_t smem_buf[]; // NOLINT constexpr bool kManageLocalTopK = Capacity > 0; - constexpr uint32_t kOpBits = 8 * sizeof(OpT); - const uint32_t pq_len = dim / pq_dim; - const uint32_t vec_len = kOpBits / gcd(kOpBits, pq_bits); + constexpr uint32_t PqShift = 1u << PqBits; // NOLINT + constexpr uint32_t PqMask = PqShift - 1u; // NOLINT + + const uint32_t pq_len = dim / pq_dim; + const uint32_t lut_size = pq_dim * PqShift; if constexpr (EnableSMemLut) { lut_scores = reinterpret_cast(smem_buf); } else { - lut_scores += (pq_dim << pq_bits) * blockIdx.x; + lut_scores += lut_size * blockIdx.x; } float* base_diff = nullptr; if constexpr (PrecompBaseDiff) { if constexpr (EnableSMemLut) { - base_diff = reinterpret_cast(lut_scores + (pq_dim << pq_bits)); + base_diff = reinterpret_cast(lut_scores + lut_size); } else { base_diff = reinterpret_cast(smem_buf); } } for (int ib = blockIdx.x; ib < n_queries * n_probes; ib += gridDim.x) { + if (ib >= gridDim.x) { + // sync shared memory accesses on the second and further iterations + __syncthreads(); + } uint32_t query_ix; uint32_t probe_ix; if (index_list == nullptr) { query_ix = ib % n_queries; probe_ix = ib / n_queries; } else { - query_ix = index_list[ib] / n_probes; - probe_ix = index_list[ib] % n_probes; + auto ordered_ix = index_list[ib]; + query_ix = ordered_ix / n_probes; + 
probe_ix = ordered_ix % n_probes; } - if (query_ix >= n_queries || probe_ix >= n_probes) continue; const uint32_t* chunk_indices = _chunk_indices + (n_probes * query_ix); const float* query = queries + (dim * query_ix); @@ -672,184 +706,292 @@ __launch_bounds__(1024) __global__ if (codebook_kind == codebook_gen::PER_SUBSPACE) { pq_center = pq_centers; } else { - pq_center = pq_centers + (pq_len << pq_bits) * label; + pq_center = pq_centers + (pq_len << PqBits) * label; } if constexpr (PrecompBaseDiff) { - // Reduce computational complexity by pre-computing the difference - // between the cluster centroid and the query. - for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { - base_diff[i] = query[i] - cluster_center[i]; - } - __syncthreads(); - } - - // Create a lookup table - // For each subspace, the lookup table stores the distance between the actual query vector - // (projected into the subspace) and all possible pq vectors in that subspace. - for (uint32_t i = threadIdx.x; i < (pq_dim << pq_bits); i += blockDim.x) { - uint32_t i_pq = i >> pq_bits; - uint32_t i_code = codebook_kind == codebook_gen::PER_CLUSTER ? 
i & ((1 << pq_bits) - 1) : i; - float score = 0.0; + // Reduce number of memory reads later by pre-computing parts of the score switch (metric) { case distance::DistanceType::L2Expanded: { - for (uint32_t j = 0; j < pq_len; j++) { - uint32_t k = j + (pq_len * i_pq); - float diff; - if constexpr (PrecompBaseDiff) { - diff = base_diff[k]; - } else { - diff = query[k] - cluster_center[k]; - } - diff -= pq_center[j + pq_len * i_code]; - score += diff * diff; + for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { + base_diff[i] = query[i] - cluster_center[i]; } } break; case distance::DistanceType::InnerProduct: { - for (uint32_t j = 0; j < pq_len; j++) { - uint32_t k = j + (pq_len * i_pq); - score += query[k] * (cluster_center[k] + pq_center[j + pq_len * i_code]); + float2 pvals; + for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { + pvals.x = query[i]; + pvals.y = cluster_center[i] * pvals.x; + reinterpret_cast(base_diff)[i] = pvals; } } break; + default: __builtin_unreachable(); } - lut_scores[i] = LutT(score); + __syncthreads(); } - uint32_t sample_offset = 0; - if (probe_ix > 0) { sample_offset = chunk_indices[probe_ix - 1]; } - uint32_t n_samples = chunk_indices[probe_ix] - sample_offset; - uint32_t n_samples32 = Pow2<32>::roundUp(n_samples); - IdxT cluster_offset = cluster_offsets[label]; + { + // Create a lookup table + // For each subspace, the lookup table stores the distance between the actual query vector + // (projected into the subspace) and all possible pq vectors in that subspace. + for (uint32_t i = threadIdx.x; i < lut_size; i += blockDim.x) { + const uint32_t i_pq = i >> PqBits; + uint32_t j = i_pq * pq_len; + const uint32_t j_end = pq_len + j; + auto cur_pq_center = pq_center + (i & PqMask) + + (codebook_kind == codebook_gen::PER_SUBSPACE ? 
j * PqShift : 0u); + float score = 0.0; + do { + float pq_c = *cur_pq_center; + cur_pq_center += PqShift; + switch (metric) { + case distance::DistanceType::L2Expanded: { + float diff; + if constexpr (PrecompBaseDiff) { + diff = base_diff[j]; + } else { + diff = query[j] - cluster_center[j]; + } + diff -= pq_c; + score += diff * diff; + } break; + case distance::DistanceType::InnerProduct: { + // NB: we negate the scores as we hardcoded select-topk to always compute the minimum + float q; + if constexpr (PrecompBaseDiff) { + float2 pvals = reinterpret_cast(base_diff)[j]; + q = pvals.x; + score -= pvals.y; + } else { + q = query[j]; + score -= q * cluster_center[j]; + } + score -= q * pq_c; + } break; + default: __builtin_unreachable(); + } + } while (++j < j_end); + lut_scores[i] = LutT(score); + } + } + // Define helper types for efficient access to the pq_dataset, which is stored in an interleaved + // format. The chunks of PQ data are stored in kIndexGroupVecLen-bytes-long chunks, interleaved + // in groups of kIndexGroupSize elems (which is normally equal to the warp size) for the fastest + // possible access by thread warps. + // + // Consider one record in the pq_dataset is `pq_dim * pq_bits`-bit-long. + // Assuming `kIndexGroupVecLen = 16`, one chunk of data read by a thread at once is 128-bits. + // Then, such a chunk contains `chunk_size = 128 / pq_bits` record elements, and the record + // consists of `ceildiv(pq_dim, chunk_size)` chunks. The chunks are interleaved in groups of 32, + // so that the warp can achieve the best coalesced read throughput. 
+ using group_align = Pow2; + using vec_align = Pow2; using local_topk_t = block_sort_t; - local_topk_t block_topk(topk, smem_buf); + using op_t = uint32_t; + using vec_t = TxN_t; - // Ensure lut_scores is written by all threads before using it in ivfpq_compute_score + uint32_t sample_offset = 0; + if (probe_ix > 0) { sample_offset = chunk_indices[probe_ix - 1]; } + uint32_t n_samples = chunk_indices[probe_ix] - sample_offset; + uint32_t n_samples_aligned = group_align::roundUp(n_samples); + IdxT cluster_offset = cluster_offsets[label]; + constexpr uint32_t kChunkSize = (kIndexGroupVecLen * 8u) / PqBits; + uint32_t pq_line_width = div_rounding_up_unsafe(pq_dim, kChunkSize) * kIndexGroupVecLen; + auto pq_thread_data = + pq_dataset + (size_t(cluster_offset) + group_align::roundDown(threadIdx.x)) * pq_line_width + + group_align::mod(threadIdx.x) * vec_align::Value; + pq_line_width *= blockDim.x; + + constexpr OutT kDummy = upper_bound(); + OutT query_kth = kDummy; + if constexpr (kManageLocalTopK) { query_kth = OutT(query_kths[query_ix]); } + local_topk_t block_topk(topk, smem_buf, query_kth); + OutT early_stop_limit = kDummy; + switch (metric) { + // If the metric is non-negative, we can use the query_kth approximation as an early stop + // threshold to skip some iterations when computing the score. Add such metrics here. 
+ case distance::DistanceType::L2Expanded: { + early_stop_limit = query_kth; + } break; + default: break; + } + + // Ensure lut_scores is written by all threads before using it in ivfpq-compute-score __threadfence_block(); __syncthreads(); // Compute a distance for each sample - const uint32_t pq_line_width = pq_dim * pq_bits / 8; - for (uint32_t i = threadIdx.x; i < n_samples32; i += blockDim.x) { - OutT score = local_topk_t::queue_t::kDummy; - if (i < n_samples) { - auto pq_ptr = - reinterpret_cast(pq_dataset + uint64_t(pq_line_width) * (cluster_offset + i)); - float fscore = ivfpq_compute_score(pq_bits, vec_len, pq_dim, pq_ptr, lut_scores); - switch (metric) { - // For similarity metrics, - // we negate the scores as we hardcoded select-topk to always take the minimum - case distance::DistanceType::InnerProduct: fscore = -fscore; break; - default: break; - } - if (fscore < float(score)) { score = OutT{fscore}; } + for (uint32_t i = threadIdx.x; i < n_samples_aligned; + i += blockDim.x, pq_thread_data += pq_line_width) { + OutT score = kDummy; + bool valid = i < n_samples; + if (valid) { + score = ivfpq_compute_score( + pq_dim, + reinterpret_cast(pq_thread_data), + lut_scores, + early_stop_limit); } if constexpr (kManageLocalTopK) { block_topk.add(score, cluster_offset + i); } else { - if (i < n_samples) { out_scores[i + sample_offset] = score; } + if (valid) { out_scores[i + sample_offset] = score; } } } - __syncthreads(); if constexpr (kManageLocalTopK) { - // sync threads before and after the topk merging operation, because we reuse smem_buf + // sync threads before the topk merging operation, because we reuse smem_buf + __syncthreads(); block_topk.done(); block_topk.store(out_scores, out_indices); - __syncthreads(); + if (threadIdx.x == 0) { atomicMin(query_kths + query_ix, float(out_scores[topk - 1])); } } else { // fill in the rest of the out_scores with dummy values uint32_t max_samples = uint32_t(Pow2<128>::roundUp(cluster_offsets[n_probes])); if 
(probe_ix + 1 == n_probes) { for (uint32_t i = threadIdx.x + sample_offset + n_samples; i < max_samples; i += blockDim.x) { - out_scores[i] = local_topk_t::queue_t::kDummy; + out_scores[i] = kDummy; } } } } } +/** + * An approximation to the number of times each cluster appears in a batched sample. + * + * If the pairs (probe_ix, query_ix) are sorted by the probe_ix, there is a good chance that + * the same probe_ix (cluster) is processed by several blocks on a single SM. This greatly + * increases the L1 cache hit rate (i.e. increases the data locality). + * + * This function gives an estimate of how many times a specific cluster may appear in the + * batch. Thus, it gives a practical limit to how many blocks should be active on the same SM + * to improve the L1 cache hit rate. + */ +constexpr inline auto expected_probe_coresidency(uint32_t n_clusters, + uint32_t n_probes, + uint32_t n_queries) -> uint32_t +{ + /* + Let say: + n = n_clusters + k = n_probes + m = n_queries + r = # of times a specific block appears in the batched sample. + + Then, r has the Binomial distribution (p = k / n): + P(r) = C(m,r) * k^r * (n - k)^(m - r) / n^m + E[r] = m * k / n + E[r | r > 0] = m * k / n / (1 - (1 - k/n)^m) + + The latter can be approximated by a much simpler formula, assuming (k / n) -> 0: + E[r | r > 0] = 1 + (m - 1) * k / (2 * n) + O( (k/n)^2 ) + */ + return 1 + (n_queries - 1) * n_probes / (2 * n_clusters); +} + +/** + * Estimate a carveout value as expected by `cudaFuncAttributePreferredSharedMemoryCarveout` + * (which does not take into account `reservedSharedMemPerBlock`), + * given by a desired schmem-L1 split and a per-block memory requirement in bytes. + * + * NB: As per the programming guide, the memory carveout setting is just a hint for the driver; it's + * free to choose any shmem-L1 configuration it deems appropriate. 
For example, if you set the + * carveout to zero, it will choose a non-zero config that will allow to run at least one active + * block per SM. + * + * @param shmem_fraction + * a fraction representing a desired split (shmem / (shmem + L1)) [0, 1]. + * @param shmem_per_block + * a shared memory usage per block (dynamic + static shared memory sizes), in bytes. + * @param dev_props + * device properties. + * @return + * a carveout value in percents [0, 100]. + */ +constexpr inline auto estimate_carveout(double shmem_fraction, + size_t shmem_per_block, + const cudaDeviceProp& dev_props) -> int +{ + using shmem_unit = Pow2<128>; + size_t m = shmem_unit::roundUp(shmem_per_block); + size_t r = dev_props.reservedSharedMemPerBlock; + size_t s = dev_props.sharedMemPerMultiprocessor; + return (size_t(100 * s * m * shmem_fraction) - (m - 1) * r) / (s * (m + r)); +} + /** * This structure selects configurable template parameters (instance) based on * the search/index parameters at runtime. * - * This is done by means of recusively iterating through a small set of possible + * This is done by means of recursively iterating through a small set of possible * values for every parameter. */ template struct ivfpq_compute_similarity { - using kernel_t = void (*)(uint32_t, - uint32_t, - uint32_t, - uint32_t, - uint32_t, - uint32_t, - distance::DistanceType, - codebook_gen, - uint32_t, - const float*, - const float*, - const uint8_t*, - const IdxT*, - const uint32_t*, - const uint32_t*, - const float*, - const uint32_t*, - LutT*, - OutT*, - IdxT*); + using kernel_t = decltype(&ivfpq_compute_similarity_kernel); template struct configured { public: - /** - * Select a proper kernel instance based on the runtime parameters. - * - * @param pq_bits - * @param pq_dim - * @param k_max - */ - static auto kernel(uint32_t pq_bits, uint32_t pq_dim, uint32_t k_max) -> kernel_t + /** Select a proper kernel instance based on the runtime parameters. 
*/ + static auto kernel(uint32_t pq_bits, uint32_t k_max) -> kernel_t { - return kernel_base(pq_bits, pq_dim, k_max); + switch (pq_bits) { + case 4: return kernel_try_capacity<4, kMaxCapacity>(k_max); + case 5: return kernel_try_capacity<5, kMaxCapacity>(k_max); + case 6: return kernel_try_capacity<6, kMaxCapacity>(k_max); + case 7: return kernel_try_capacity<7, kMaxCapacity>(k_max); + case 8: return kernel_try_capacity<8, kMaxCapacity>(k_max); + default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits); + } } private: - template + template static auto kernel_try_capacity(uint32_t k_max) -> kernel_t { if constexpr (Capacity > 0) { - if (k_max == 0 || k_max > Capacity) { return kernel_try_capacity(k_max); } + if (k_max == 0 || k_max > Capacity) { return kernel_try_capacity(k_max); } } - if constexpr (Capacity > 32) { - if (k_max * 2 <= Capacity) { return kernel_try_capacity(k_max); } + if constexpr (Capacity > 1) { + if (k_max * 2 <= Capacity) { return kernel_try_capacity(k_max); } } - return ivfpq_compute_similarity_kernel; } + }; + + /** Estimate the occupancy for the given kernel on the given device. 
*/ + struct occupancy_t { + using shmem_unit = Pow2<128>; - static auto kernel_base(uint32_t pq_bits, uint32_t pq_dim, uint32_t k_max) -> kernel_t + int blocks_per_sm = 0; + double occupancy = 0.0; + double shmem_use = 1.0; + + inline occupancy_t() = default; + inline occupancy_t(size_t smem, + uint32_t n_threads, + kernel_t kernel, + const cudaDeviceProp& dev_props) { - switch (gcd(pq_bits * pq_dim, 64)) { - case 64: return kernel_try_capacity(k_max); - case 32: return kernel_try_capacity(k_max); - case 16: return kernel_try_capacity(k_max); - case 8: return kernel_try_capacity(k_max); - default: - RAFT_FAIL("`pq_bits * pq_dim` must be a multiple of 8 (pq_bits = %u, pq_dim = %u).", - pq_bits, - pq_dim); - } + RAFT_CUDA_TRY( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, kernel, n_threads, smem)); + occupancy = double(blocks_per_sm * n_threads) / double(dev_props.maxThreadsPerMultiProcessor); + shmem_use = double(shmem_unit::roundUp(smem) * blocks_per_sm) / + double(dev_props.sharedMemPerMultiprocessor); } }; struct selected { - void* kernel; + kernel_t kernel; dim3 grid_dim; dim3 block_dim; size_t smem_size; @@ -858,8 +1000,8 @@ struct ivfpq_compute_similarity { template void operator()(rmm::cuda_stream_view stream, Args... args) { - void* xs[] = {&args...}; // NOLINT - RAFT_CUDA_TRY(cudaLaunchKernel(kernel, grid_dim, block_dim, xs, smem_size, stream)); + kernel<<>>(args...); + RAFT_CHECK_CUDA(stream); } }; @@ -873,128 +1015,231 @@ struct ivfpq_compute_similarity { * whether use the fused calculate+select or just calculate the distances for each * query and probed cluster. * + * @param locality_hint + * beyond this limit do not consider increasing the number of active blocks per SM + * would improve locality anymore. 
*/ - static inline auto select(bool manage_local_topk, + static inline auto select(const cudaDeviceProp& dev_props, + bool manage_local_topk, + int locality_hint, + double preferred_shmem_carveout, uint32_t pq_bits, uint32_t pq_dim, - uint32_t rot_dim, - uint32_t preferred_thread_block_size, + uint32_t precomp_data_count, uint32_t n_queries, uint32_t n_probes, uint32_t topk) -> selected { + // Shared memory for storing the lookup table + size_t lut_mem = sizeof(LutT) * (pq_dim << pq_bits); + // Shared memory for storing pre-computed pieces to speedup the lookup table construction + // (e.g. the distance between a cluster center and the query for L2). + size_t bdf_mem = sizeof(float) * precomp_data_count; + // Shared memory for the fused top-k component; it may overlap with the other uses of shared + // memory and depends on the number of threads. + struct ltk_mem_t { + uint32_t subwarp_size; + uint32_t topk; + bool manage_local_topk; + ltk_mem_t(bool manage_local_topk, uint32_t topk) + : manage_local_topk(manage_local_topk), topk(topk) + { + subwarp_size = WarpSize; + while (topk * 2 <= subwarp_size) { + subwarp_size /= 2; + } + } + + [[nodiscard]] auto operator()(uint32_t n_threads) const -> size_t + { + return manage_local_topk ? topk::template calc_smem_size_for_block_wide( + n_threads / subwarp_size, topk) + : 0; + } + } ltk_mem{manage_local_topk, topk}; + + // Total amount of work; should be enough to occupy the GPU. + uint32_t n_blocks = n_queries * n_probes; + + // The minimum block size we may want: + // 1. It's a power-of-two for efficient L1 caching of pq_centers values + // (multiples of `1 << pq_bits`). + // 2. It should be large enough to fully utilize an SM. 
+ uint32_t n_threads_min = WarpSize; + while (dev_props.maxBlocksPerMultiProcessor * int(n_threads_min) < + dev_props.maxThreadsPerMultiProcessor) { + n_threads_min *= 2; + } + // Further increase the minimum block size to make sure full device occupancy + // (NB: this may lead to `n_threads_min` being larger than the kernel's maximum) + while (int(n_blocks * n_threads_min) < + dev_props.multiProcessorCount * dev_props.maxThreadsPerMultiProcessor && + int(n_threads_min) < dev_props.maxThreadsPerBlock) { + n_threads_min *= 2; + } + // Even further, increase it to allow less blocks per SM if there not enough queries. + // With this, we reduce the chance of different clusters being processed by two blocks + // on the same SM and thus improve the data locality for L1 caching. + while (int(n_queries * n_threads_min) < dev_props.maxThreadsPerMultiProcessor && + int(n_threads_min) < dev_props.maxThreadsPerBlock) { + n_threads_min *= 2; + } + + // Granularity of changing the number of threads when computing the maximum block size. + // It's good to have it multiple of the PQ book width. + uint32_t n_threads_gty = round_up_safe(1u << pq_bits, WarpSize); + + /* + Shared memory / L1 cache balance is the main limiter of this kernel. + The more blocks per SM we launch, the more shared memory we need. Besides that, we have + three versions of the kernel varying in performance and shmem usage. + + We try the most demanding and the fastest kernel first, trying to maximize occupancy with + the minimum number of blocks (just one, really). Then, we tweak the `n_threads` to further + optimize occupancy and data locality for the L1 cache. + */ using conf_fast = configured; using conf_no_basediff = configured; using conf_no_smem_lut = configured; - - kernel_t kernel_fast = conf_fast::kernel(pq_bits, pq_dim, manage_local_topk ? topk : 0u); - kernel_t kernel_no_basediff = - conf_no_basediff::kernel(pq_bits, pq_dim, manage_local_topk ? 
topk : 0u); - kernel_t kernel_no_smem_lut = - conf_no_smem_lut::kernel(pq_bits, pq_dim, manage_local_topk ? topk : 0u); - - const size_t smem_threshold = 48 * 1024; - size_t smem_size = sizeof(LutT) * (pq_dim << pq_bits); - size_t smem_size_base_diff = sizeof(float) * rot_dim; - - uint32_t n_blocks = n_queries * n_probes; - uint32_t n_threads = 1024; - // preferred_thread_block_size == 0 means using auto thread block size calculation mode - if (preferred_thread_block_size == 0) { - const uint32_t thread_min = 256; - int cur_dev; - cudaDeviceProp dev_props; - RAFT_CUDA_TRY(cudaGetDevice(&cur_dev)); - RAFT_CUDA_TRY(cudaGetDeviceProperties(&dev_props, cur_dev)); - while (n_threads > thread_min) { - if (n_blocks < uint32_t(getMultiProcessorCount() * (1024 / (n_threads / 2)))) { break; } - if (dev_props.sharedMemPerMultiprocessor * 2 / 3 < smem_size * (1024 / (n_threads / 2))) { - break; - } - n_threads /= 2; + auto topk_or_zero = manage_local_topk ? topk : 0u; + std::array candidates{ + std::make_tuple(conf_fast::kernel(pq_bits, topk_or_zero), lut_mem + bdf_mem, true), + std::make_tuple(conf_no_basediff::kernel(pq_bits, topk_or_zero), lut_mem, true), + std::make_tuple(conf_no_smem_lut::kernel(pq_bits, topk_or_zero), bdf_mem, false)}; + + // we may allow slightly lower than 100% occupancy; + constexpr double kTargetOccupancy = 0.75; + // This struct is used to select the better candidate + occupancy_t selected_perf{}; + selected selected_config; + for (auto [kernel, smem_size_const, lut_is_in_shmem] : candidates) { + if (smem_size_const > dev_props.sharedMemPerBlockOptin) { + // Even a single block cannot fit into an SM due to shmem requirements. Skip the candidate. + continue; } - } else { - n_threads = preferred_thread_block_size; - } - size_t smem_size_local_topk = - manage_local_topk - ? 
topk::template calc_smem_size_for_block_wide(n_threads / WarpSize, topk) - : 0; - smem_size = max(smem_size, smem_size_local_topk); - - kernel_t kernel = kernel_no_basediff; - - bool kernel_no_basediff_available = true; - bool use_smem_lut = true; - if (smem_size > smem_threshold) { - cudaError_t cuda_status = cudaFuncSetAttribute( - kernel_no_basediff, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); + + // First, we set the carveout hint to the preferred value. The driver will increase this if + // needed to run at least one block per SM. At the same time, if more blocks fit into one SM, + // this carveout value will limit the calculated occupancy. When we're done selecting the best + // launch configuration, we will tighten the carveout once more, based on the final memory + // usage and occupancy. + const int max_carveout = + estimate_carveout(preferred_shmem_carveout, smem_size_const, dev_props); + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout, max_carveout)); + + // Get the theoretical maximum possible number of threads per block + cudaFuncAttributes kernel_attrs; + RAFT_CUDA_TRY(cudaFuncGetAttributes(&kernel_attrs, kernel)); + uint32_t n_threads = + round_down_safe(kernel_attrs.maxThreadsPerBlock, n_threads_gty); + + // Actual required shmem depens on the number of threads + size_t smem_size = max(smem_size_const, ltk_mem(n_threads)); + + // Make sure the kernel can get enough shmem. + cudaError_t cuda_status = + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); if (cuda_status != cudaSuccess) { RAFT_EXPECTS( cuda_status == cudaGetLastError(), "Tried to reset the expected cuda error code, but it didn't match the expectation"); - kernel_no_basediff_available = false; - - // Use "kernel_no_smem_lut" which just uses small amount of shared memory. 
- RAFT_LOG_DEBUG( - "Non-shared-mem look-up table kernel is selected, because it wouldn't fit shmem " - "required: " - "%zu bytes)", - smem_size); - kernel = kernel_no_smem_lut; - use_smem_lut = false; - n_threads = 1024; - smem_size_local_topk = - manage_local_topk - ? topk::template calc_smem_size_for_block_wide(n_threads / WarpSize, topk) - : 0; - smem_size = max(smem_size_base_diff, smem_size_local_topk); - n_blocks = getMultiProcessorCount(); + // Failed to request enough shmem for the kernel. Skip the candidate. + continue; } - } - if (kernel_no_basediff_available) { - bool kernel_fast_available = true; - if (smem_size + smem_size_base_diff > smem_threshold) { - cudaError_t cuda_status = cudaFuncSetAttribute(kernel_fast, - cudaFuncAttributeMaxDynamicSharedMemorySize, - smem_size + smem_size_base_diff); - if (cuda_status != cudaSuccess) { - RAFT_EXPECTS( - cuda_status == cudaGetLastError(), - "Tried to reset the expected cuda error code, but it didn't match the expectation"); - kernel_fast_available = false; - RAFT_LOG_DEBUG( - "No-precomputed-basediff kernel is selected, because the basediff wouldn't fit (shmem " - "required: %zu bytes)", - smem_size + smem_size_base_diff); + + occupancy_t cur(smem_size, n_threads, kernel, dev_props); + if (cur.blocks_per_sm <= 0) { + // For some reason, we still cannot make this kernel run. Skip the candidate. + continue; + } + + { + // Try to reduce the number of threads to increase occupancy and data locality + auto n_threads_tmp = n_threads_min; + while (n_threads_tmp * 2 < n_threads) { + n_threads_tmp *= 2; + } + if (n_threads_tmp < n_threads) { + while (n_threads_tmp >= n_threads_min) { + auto smem_size_tmp = max(smem_size_const, ltk_mem(n_threads_tmp)); + occupancy_t tmp(smem_size_tmp, n_threads_tmp, kernel, dev_props); + bool select_it = false; + if (lut_is_in_shmem && locality_hint >= tmp.blocks_per_sm) { + // Normally, the smaller the block the better for L1 cache hit rate. 
+ // Hence, the occupancy should be "just good enough" + select_it = tmp.occupancy >= min(kTargetOccupancy, cur.occupancy); + } else if (lut_is_in_shmem) { + // If we don't have enough repeating probes (locality_hint < tmp.blocks_per_sm), + // the locality is not going to improve with increasing the number of blocks per SM. + // Hence, the only metric here is the occupancy. + select_it = tmp.occupancy > cur.occupancy; + } else { + // If we don't use shared memory for the lookup table, increasing the number of blocks + // is very taxing on the global memory usage. + // In this case, the occupancy must increase a lot to make it worth the cost. + select_it = tmp.occupancy >= min(1.0, cur.occupancy / kTargetOccupancy); + } + if (select_it) { + n_threads = n_threads_tmp; + smem_size = smem_size_tmp; + cur = tmp; + } + n_threads_tmp /= 2; + } } } - if (kernel_fast_available) { - int kernel_no_basediff_n_blocks = 0; - RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &kernel_no_basediff_n_blocks, kernel_no_basediff, n_threads, smem_size)); - - int kernel_fast_n_blocks = 0; - RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &kernel_fast_n_blocks, kernel_fast, n_threads, smem_size + smem_size_base_diff)); - - // Use "kernel_fast" only if GPU occupancy does not drop - if (kernel_no_basediff_n_blocks == kernel_fast_n_blocks) { - kernel = kernel_fast; - smem_size += smem_size_base_diff; + + { + if (selected_perf.occupancy <= 0.0 // no candidate yet + || (selected_perf.occupancy < cur.occupancy * kTargetOccupancy && + selected_perf.shmem_use >= cur.shmem_use) // much improved occupancy + ) { + selected_perf = cur; + if (lut_is_in_shmem) { + selected_config = { + kernel, dim3(n_blocks, 1, 1), dim3(n_threads, 1, 1), smem_size, size_t(0)}; + } else { + // When the global memory is used for the lookup table, we need to minimize the grid + // size; otherwise, the kernel may quickly run out of memory. 
+ auto n_blocks_min = + std::min(n_blocks, cur.blocks_per_sm * dev_props.multiProcessorCount); + selected_config = {kernel, + dim3(n_blocks_min, 1, 1), + dim3(n_threads, 1, 1), + smem_size, + size_t(n_blocks_min) * size_t(pq_dim << pq_bits)}; + } + // Actual shmem/L1 split wildly rounds up the specified preferred carveout, so we set here + // a rather conservative bar; most likely, the kernel gets more shared memory than this, + // and the occupancy doesn't get hurt. + auto carveout = std::min(max_carveout, std::ceil(100.0 * cur.shmem_use)); + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout, carveout)); + if (cur.occupancy >= kTargetOccupancy) { break; } + } else if (selected_perf.occupancy > 0.0) { + // If we found a reasonable candidate on a previous iteration, and this one is not better, + // then don't try any more candidates because they are much slower anyway. + break; } } } - uint32_t device_lut_size = use_smem_lut ? 0u : n_blocks * (pq_dim << pq_bits); - return {reinterpret_cast(kernel), - dim3(n_blocks, 1, 1), - dim3(n_threads, 1, 1), - smem_size, - device_lut_size}; + RAFT_EXPECTS(selected_perf.occupancy > 0.0, + "Couldn't determine a working kernel launch configuration."); + + return selected_config; } }; +inline auto is_local_topk_feasible(uint32_t k, uint32_t n_probes, uint32_t n_queries) -> bool +{ + if (k > kMaxCapacity) { return false; } // warp_sort not possible + if (n_probes <= 16) { return false; } // too few clusters + if (n_queries * n_probes <= 256) { return false; } // overall amount of work is too small + return true; +} + /** * The "main part" of the search, which assumes that outer-level `search` has already: * @@ -1009,13 +1254,13 @@ void ivfpq_search_worker(const handle_t& handle, uint32_t max_samples, uint32_t n_probes, uint32_t topK, - uint32_t preferred_thread_block_size, uint32_t n_queries, const uint32_t* clusters_to_probe, // [n_queries, n_probes] const float* query, // [n_queries, 
rot_dim] IdxT* neighbors, // [n_queries, topK] float* distances, // [n_queries, topK] float scaling_factor, + double preferred_shmem_carveout, rmm::mr::device_memory_resource* mr) { auto stream = handle.get_stream(); @@ -1025,13 +1270,10 @@ void ivfpq_search_worker(const handle_t& handle, auto data_indices = index.indices().data_handle(); auto cluster_centers = index.centers_rot().data_handle(); auto cluster_offsets = index.list_offsets().data_handle(); + auto cluster_sizes = index.list_sizes().data_handle(); - bool manage_local_topk = topK <= kMaxCapacity // depth is not too large - && n_probes >= 16 // not too few clusters looked up - && - n_queries * n_probes >= 256 // overall amount of work is not too small - ; - auto topk_len = manage_local_topk ? n_probes * topK : max_samples; + bool manage_local_topk = is_local_topk_feasible(topK, n_probes, n_queries); + auto topk_len = manage_local_topk ? n_probes * topK : max_samples; if (manage_local_topk) { RAFT_LOG_DEBUG("Fused version of the search kernel is selected (manage_local_topk == true)"); } else { @@ -1052,10 +1294,12 @@ void ivfpq_search_worker(const handle_t& handle, neighbors_ptr = neighbors_buf.data(); } - calc_chunk_indices::configure(n_probes, n_queries)( - cluster_offsets, clusters_to_probe, chunk_index.data(), num_samples.data(), stream); + calc_chunk_indices::configure(n_probes, n_queries)( + cluster_sizes, clusters_to_probe, chunk_index.data(), num_samples.data(), stream); - if (n_queries * n_probes > 256) { + auto coresidency = expected_probe_coresidency(index.n_lists(), n_probes, n_queries); + + if (coresidency > 1) { // Sorting index by cluster number (label). 
// The goal is to incrase the L2 cache hit rate to read the vectors // of a cluster by processing the cluster at the same time as much as @@ -1096,22 +1340,49 @@ void ivfpq_search_worker(const handle_t& handle, } // select and run the main search kernel + uint32_t precomp_data_count = 0; + switch (index.metric()) { + case distance::DistanceType::L2SqrtExpanded: + case distance::DistanceType::L2SqrtUnexpanded: + case distance::DistanceType::L2Unexpanded: + case distance::DistanceType::L2Expanded: { + // stores basediff (query[i] - center[i]) + precomp_data_count = index.rot_dim(); + } break; + case distance::DistanceType::InnerProduct: { + // stores two components (query[i] * center[i], query[i] * center[i]) + precomp_data_count = index.rot_dim() * 2; + } break; + default: { + RAFT_FAIL("Unsupported metric"); + } break; + } + auto search_instance = - ivfpq_compute_similarity::select(manage_local_topk, + ivfpq_compute_similarity::select(handle.get_device_properties(), + manage_local_topk, + coresidency, + preferred_shmem_carveout, index.pq_bits(), index.pq_dim(), - index.rot_dim(), - preferred_thread_block_size, + precomp_data_count, n_queries, n_probes, topK); rmm::device_uvector device_lut(search_instance.device_lut_size, stream, mr); + rmm::device_uvector query_kths(0, stream, mr); + if (manage_local_topk) { + query_kths.resize(n_queries, stream); + thrust::fill_n(handle.get_thrust_policy(), + query_kths.data(), + n_queries, + float(dummy_block_sort_t::queue_t::kDummy)); + } search_instance(stream, index.size(), index.rot_dim(), n_probes, - index.pq_bits(), index.pq_dim(), n_queries, index.metric(), @@ -1125,6 +1396,7 @@ void ivfpq_search_worker(const handle_t& handle, chunk_index.data(), query, index_list_sorted, + query_kths.data(), device_lut.data(), distances_buf.data(), neighbors_ptr); @@ -1164,19 +1436,7 @@ void ivfpq_search_worker(const handle_t& handle, template struct ivfpq_search { public: - using fun_t = void (*)(const handle_t&, - const ivf_pq::index&, 
- uint32_t, - uint32_t, - uint32_t, - uint32_t, - uint32_t, - const uint32_t*, - const float*, - IdxT*, - float*, - float, - rmm::mr::device_memory_resource*); + using fun_t = decltype(&ivfpq_search_worker); /** * Select an instance of the ivf-pq search function based on search tuning parameters, @@ -1226,12 +1486,18 @@ struct ivfpq_search { * A heuristic for bounding the number of queries per batch, to improve GPU utilization. * (based on the number of SMs and the work size). * + * @param k top-k + * @param n_probes number of selected clusters per query * @param n_queries number of queries hoped to be processed at once. * (maximum value for the returned batch size) + * @param max_samples maximum possible number of samples to be processed for the given `n_probes` * * @return maximum recommended batch size. */ -inline auto get_max_batch_size(uint32_t n_queries) -> uint32_t +inline auto get_max_batch_size(uint32_t k, + uint32_t n_probes, + uint32_t n_queries, + uint32_t max_samples) -> uint32_t { uint32_t max_batch_size = n_queries; uint32_t n_ctas_total = getMultiProcessorCount() * 2; @@ -1243,6 +1509,23 @@ inline auto get_max_batch_size(uint32_t n_queries) -> uint32_t float utilization_1 = float(n_ctas_total_per_batch_1 * max_batch_size_1) / n_ctas_total; if (utilization < utilization_1) { max_batch_size = max_batch_size_1; } } + // Check in the tmp distance buffer is not too big + auto ws_size = [k, n_probes, max_samples](uint32_t bs) -> uint64_t { + return uint64_t(is_local_topk_feasible(k, n_probes, bs) ? k * n_probes : max_samples) * bs; + }; + constexpr uint64_t kMaxWsSize = 1024 * 1024 * 1024; + if (ws_size(max_batch_size) > kMaxWsSize) { + uint32_t smaller_batch_size = 1; + // take powers of two for better alignment + while (smaller_batch_size * 2 <= max_batch_size) { + smaller_batch_size <<= 1; + } + // gradually reduce the batch size until we fit into the max size limit. 
+ while (smaller_batch_size > 1 && ws_size(smaller_batch_size) > kMaxWsSize) { + smaller_batch_size >>= 1; + } + return smaller_batch_size; + } return max_batch_size; } @@ -1261,7 +1544,11 @@ inline void search(const handle_t& handle, static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "Unsupported element type."); common::nvtx::range fun_scope( - "ivf_pq::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim()); + "ivf_pq::search(n_queries = %u, n_probes = %u, k = %u, dim = %zu)", + n_queries, + params.n_probes, + k, + index.dim()); RAFT_EXPECTS( params.internal_distance_dtype == CUDA_R_16F || params.internal_distance_dtype == CUDA_R_32F, @@ -1269,11 +1556,6 @@ inline void search(const handle_t& handle, RAFT_EXPECTS(params.lut_dtype == CUDA_R_16F || params.lut_dtype == CUDA_R_32F || params.lut_dtype == CUDA_R_8U, "lut_dtype must be CUDA_R_16F, CUDA_R_32F or CUDA_R_8U"); - RAFT_EXPECTS( - params.preferred_thread_block_size == 256 || params.preferred_thread_block_size == 512 || - params.preferred_thread_block_size == 1024 || params.preferred_thread_block_size == 0, - "preferred_thread_block_size must be 0, 256, 512 or 1024, but %u is given.", - params.preferred_thread_block_size); RAFT_EXPECTS(k > 0, "parameter `k` in top-k must be positive."); RAFT_EXPECTS( k <= index.size(), @@ -1326,7 +1608,7 @@ inline void search(const handle_t& handle, // Maximum number of query vectors to search at the same time. 
const auto max_queries = std::min(std::max(n_queries, 1), 4096); - auto max_batch_size = get_max_batch_size(max_queries); + auto max_batch_size = get_max_batch_size(k, n_probes, max_queries, max_samples); rmm::device_uvector float_queries(max_queries * dim_ext, stream, mr); rmm::device_uvector rot_queries(max_queries * index.rot_dim(), stream, mr); @@ -1380,13 +1662,13 @@ inline void search(const handle_t& handle, max_samples, params.n_probes, k, - params.preferred_thread_block_size, batch_size, clusters_to_probe.data() + uint64_t(params.n_probes) * offset_b, rot_queries.data() + uint64_t(index.rot_dim()) * offset_b, neighbors + uint64_t(k) * (offset_q + offset_b), distances + uint64_t(k) * (offset_q + offset_b), utils::config::kDivisor / utils::config::kDivisor, + params.preferred_shmem_carveout, mr); } } diff --git a/cpp/include/raft/spatial/knn/detail/topk.cuh b/cpp/include/raft/spatial/knn/detail/topk.cuh index 5adf6df472..f4dcb53088 100644 --- a/cpp/include/raft/spatial/knn/detail/topk.cuh +++ b/cpp/include/raft/spatial/knn/detail/topk.cuh @@ -19,6 +19,8 @@ #include "topk/radix_topk.cuh" #include "topk/warpsort_topk.cuh" +#include + #include #include @@ -73,7 +75,11 @@ void select_topk(const T* in, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = nullptr) { - if (k <= raft::spatial::knn::detail::topk::kMaxCapacity) { + common::nvtx::range fun_scope( + "matrix::select_topk(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k); + // TODO (achirkin): investigate the trade-off for a wider variety of inputs. 
+ const bool radix_faster = batch_size >= 64 && len >= 102400 && k >= 128; + if (k <= raft::spatial::knn::detail::topk::kMaxCapacity && !radix_faster) { topk::warp_sort_topk( in, in_idx, batch_size, len, k, out, out_idx, select_min, stream, mr); } else { diff --git a/cpp/include/raft/spatial/knn/detail/topk/bitonic_sort.cuh b/cpp/include/raft/spatial/knn/detail/topk/bitonic_sort.cuh index 40ac7b0b92..630acab2b8 100644 --- a/cpp/include/raft/spatial/knn/detail/topk/bitonic_sort.cuh +++ b/cpp/include/raft/spatial/knn/detail/topk/bitonic_sort.cuh @@ -94,7 +94,7 @@ class bitonic { * You can think of this function in two ways: * * 1) Sort any bitonic sequence. - * 2) Merge two halfs of the input data assuming they're already sorted, and their order is + * 2) Merge two halves of the input data assuming they're already sorted, and their order is * opposite (i.e. either ascending, descending or vice-versa). * * The input pointers are unique per-thread. diff --git a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh index 84cc072620..cbe9f36e97 100644 --- a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh +++ b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh @@ -33,7 +33,7 @@ Three APIs of different scopes are provided: 1. host function: warp_sort_topk() 2. block-wide API: class block_sort - 3. warp-wide API: class warp_sort_filtered and class warp_sort_immediate + 3. warp-wide API: several implementations of warp_sort_* 1. warp_sort_topk() @@ -42,7 +42,7 @@ 2. class block_sort It can be regarded as a fixed size priority queue for a thread block, although the API is not typical. - class warp_sort_filtered and warp_sort_immediate can be used to instantiate block_sort. + one of the classes `warp_sort_*` can be used to instantiate block_sort. It uses dynamic shared memory as an intermediate buffer. 
So the required shared memory size should be calculated using @@ -70,7 +70,7 @@ kernel<<>>(); - 3. class warp_sort_filtered and class warp_sort_immediate + 3. class warp_sort_* These two classes can be regarded as fixed size priority queue for a warp. Usage is similar to class block_sort. No shared memory is needed. @@ -139,7 +139,7 @@ class warp_sort { public: /** - * The `empty` value for the choosen binary operation, + * The `empty` value for the chosen binary operation, * i.e. `Ascending ? upper_bound() : lower_bound()`. */ static constexpr T kDummy = Ascending ? upper_bound() : lower_bound(); @@ -168,7 +168,7 @@ class warp_sort { * * When it actually loads the values, it always performs some collective warp operations in the * end, thus enforcing warp sync. This means, it's safe to call `store` with the same arguments - * after `load_sorted` without extra sync. Note, however, that this is not neccesarily true for + * after `load_sorted` without extra sync. Note, however, that this is not necessarily true for * the reverse order, because the access patterns of `store` and `load_sorted` are different. 
* * @param[in] in @@ -276,8 +276,8 @@ class warp_sort_filtered : public warp_sort { using warp_sort::kWarpWidth; using warp_sort::k; - __device__ warp_sort_filtered(int k) - : warp_sort(k), buf_len_(0), k_th_(kDummy) + __device__ warp_sort_filtered(int k, T limit) + : warp_sort(k), buf_len_(0), k_th_(limit) { #pragma unroll for (int i = 0; i < kMaxBufLen; i++) { @@ -286,6 +286,11 @@ class warp_sort_filtered : public warp_sort { } } + __device__ __forceinline__ explicit warp_sort_filtered(int k) + : warp_sort_filtered(k, kDummy) + { + } + __device__ void add(T val, IdxT idx) { // comparing for k_th should reduce the total amount of updates: @@ -356,6 +361,108 @@ class warp_sort_filtered : public warp_sort { T k_th_; }; +/** + * This version of warp_sort compares each input element against the current + * estimate of k-th value before adding it to the intermediate sorting buffer. + * In contrast to `warp_sort_filtered`, it keeps one distributed buffer for + * all threads in a warp (independently of the subwarp size), which makes its flushing less often. + */ +template +class warp_sort_distributed : public warp_sort { + public: + using warp_sort::kDummy; + using warp_sort::kWarpWidth; + using warp_sort::k; + + __device__ warp_sort_distributed(int k, T limit) + : warp_sort(k), + buf_val_(kDummy), + buf_idx_(IdxT{}), + buf_len_(0), + k_th_(limit) + { + } + + __device__ __forceinline__ explicit warp_sort_distributed(int k) + : warp_sort_distributed(k, kDummy) + { + } + + __device__ void add(T val, IdxT idx) + { + // mask tells which lanes in the warp have valid items to be added + uint32_t mask = ballot(is_ordered(val, k_th_)); + if (mask == 0) { return; } + // how many elements to be added + uint32_t n_valid = __popc(mask); + // index of the source lane containing the value to put into the current lane. + uint32_t src_ix = 0; + // remove a few smallest set bits from the mask. 
+ for (uint32_t i = std::min(n_valid, Pow2::mod(uint32_t(laneId()) - buf_len_)); i > 0; + i--) { + src_ix = __ffs(mask) - 1; + mask ^= (0x1u << src_ix); + } + // now the least significant bit of the mask corresponds to the lane id we want to get. + // for not-added (invalid) indices, the mask is zeroed by now. + src_ix = __ffs(mask) - 1; + // rearrange the inputs to be ready to put them into the tmp buffer + val = shfl(val, src_ix); + idx = shfl(idx, src_ix); + // for non-valid lanes, src_ix should be uint(-1) + if (mask == 0) { val = kDummy; } + // save the values into the free slots of the warp tmp buffer + if (laneId() >= buf_len_) { + buf_val_ = val; + buf_idx_ = idx; + } + buf_len_ += n_valid; + if (buf_len_ < WarpSize) { return; } + // merge the warp tmp buffer into the queue + merge_buf_(); + buf_len_ -= WarpSize; + // save the inputs that couldn't fit before the merge + if (laneId() < buf_len_) { + buf_val_ = val; + buf_idx_ = idx; + } + } + + __device__ void done() + { + if (buf_len_ != 0) { + merge_buf_(); + buf_len_ = 0; + } + } + + private: + __device__ __forceinline__ void set_k_th_() + { + // NB on using srcLane: it's ok if it is outside the warp size / width; + // the modulo op will be done inside the __shfl_sync. + k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth); + } + + __device__ __forceinline__ void merge_buf_() + { + topk::bitonic<1>(!Ascending, kWarpWidth).sort(buf_val_, buf_idx_); + this->merge_in<1>(&buf_val_, &buf_idx_); + set_k_th_(); // contains warp sync + buf_val_ = kDummy; + } + + using warp_sort::kMaxArrLen; + using warp_sort::val_arr_; + using warp_sort::idx_arr_; + + T buf_val_; + IdxT buf_idx_; + uint32_t buf_len_; // 0 <= buf_len_ <= WarpSize + + T k_th_; +}; + /** * This version of warp_sort adds every input element into the intermediate sorting * buffer, and thus does the sorting step every `Capacity` input elements. 
@@ -436,7 +543,8 @@ class block_sort { public: using queue_t = WarpSortWarpWide; - __device__ block_sort(int k, uint8_t* smem_buf) : queue_(k) + template + __device__ block_sort(int k, uint8_t* smem_buf, Args... args) : queue_(k, args...) { val_smem_ = reinterpret_cast(smem_buf); const int num_of_warp = subwarp_align::div(blockDim.x); diff --git a/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh b/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh index 39b55e315e..b6ffbd5122 100644 --- a/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh @@ -75,7 +75,7 @@ struct KeyValuePair { // template specialization / expansion and constexpr, and it uses warp // shuffles to exchange values between warp lanes. // -// A note about comparsions: +// A note about comparisons: // // For a sorting network of keys only, we only need one // comparison (a < b). However, what we really need to know is @@ -111,7 +111,7 @@ struct KeyValuePair { // I have tried both re-arranging the order in the higher lane to get // away with one comparison or adding the value to the check; both // result in greater register consumption or lower speed than just -// perfoming both < and > comparisons with the variables, so I just +// performing both < and > comparisons with the variables, so I just // stick with this. 
// This function merges kWarpSize / 2L lists in parallel using warp diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp index e4e028e9f0..f225438841 100644 --- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp @@ -65,7 +65,7 @@ enum struct sparse_mv_alg_t : int { SPARSE_MV_UNDEFINED = -1, SPARSE_MV_ALG_DEFAULT, // generic, for any sparse matrix SPARSE_MV_ALG1, // typical for CSR - SPARSE_MV_ALG2 // may provide better performamce for irregular sparse matrices + SPARSE_MV_ALG2 // may provide better performance for irregular sparse matrices }; // Vector "view"-like aggregate for linear algebra purposes diff --git a/cpp/include/raft/spectral/eigen_solvers.cuh b/cpp/include/raft/spectral/eigen_solvers.cuh index 787a5bde39..88e4abe513 100644 --- a/cpp/include/raft/spectral/eigen_solvers.cuh +++ b/cpp/include/raft/spectral/eigen_solvers.cuh @@ -18,7 +18,7 @@ #pragma once -#include +#include #include namespace raft { @@ -58,17 +58,17 @@ struct lanczos_solver_t { RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - linalg::computeSmallestEigenvectors(handle, - A, - config_.n_eigVecs, - config_.maxIter, - config_.restartIter, - config_.tol, - config_.reorthogonalize, - iters, - eigVals, - eigVecs, - config_.seed); + sparse::solver::computeSmallestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed); return iters; } @@ -81,17 +81,17 @@ struct lanczos_solver_t { RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - linalg::computeLargestEigenvectors(handle, - A, - config_.n_eigVecs, - config_.maxIter, - config_.restartIter, - config_.tol, - 
config_.reorthogonalize, - iters, - eigVals, - eigVecs, - config_.seed); + sparse::solver::computeLargestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed); return iters; } @@ -104,4 +104,4 @@ struct lanczos_solver_t { } // namespace spectral } // namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/stats/contingency_matrix.cuh b/cpp/include/raft/stats/contingency_matrix.cuh index 10dedc44eb..f36d95daff 100644 --- a/cpp/include/raft/stats/contingency_matrix.cuh +++ b/cpp/include/raft/stats/contingency_matrix.cuh @@ -91,7 +91,7 @@ size_t getContingencyMatrixWorkspaceSize(int nSamples, } /** - * @brief contruct contingency matrix given input ground truth and prediction + * @brief construct contingency matrix given input ground truth and prediction * labels. Users should call function getInputClassCardinality to find * and allocate memory for output. Similarly workspace requirements * should be checked using function getContingencyMatrixWorkspaceSize @@ -130,7 +130,7 @@ void contingencyMatrix(const T* groundTruth, } /** - * @brief contruct contingency matrix given input ground truth and prediction + * @brief construct contingency matrix given input ground truth and prediction * labels. Users should call function getInputClassCardinality to find * and allocate memory for output. Similarly workspace requirements * should be checked using function getContingencyMatrixWorkspaceSize @@ -138,21 +138,31 @@ void contingencyMatrix(const T* groundTruth, * @tparam out_t output matrix type * @tparam idx_t Index type of matrix extent. * @tparam layout_t Layout type of the input data. + * @tparam opt_min_label_t std::optional @c opt_min_label + * @tparam opt_max_label_t std::optional @c opt_max_label * @param[in] handle: the raft handle. 
* @param[in] ground_truth: device 1-d array for ground truth (num of rows) * @param[in] predicted_label: device 1-d array for prediction (num of columns) * @param[out] out_mat: output buffer for contingency matrix - * @param[in] min_label: Optional, min value in input ground truth array - * @param[in] max_label: Optional, max value in input ground truth array + * @param[in] opt_min_label: std::optional, min value in input ground truth array + * @param[in] opt_max_label: std::optional, max value in input ground truth array */ -template +template void contingency_matrix(const raft::handle_t& handle, raft::device_vector_view ground_truth, raft::device_vector_view predicted_label, raft::device_matrix_view out_mat, - std::optional min_label = std::nullopt, - std::optional max_label = std::nullopt) + opt_min_label_t&& opt_min_label, + opt_max_label_t&& opt_max_label) { + std::optional min_label = std::forward(opt_min_label); + std::optional max_label = std::forward(opt_max_label); + RAFT_EXPECTS(ground_truth.size() == predicted_label.size(), "Size mismatch"); RAFT_EXPECTS(ground_truth.is_exhaustive(), "ground_truth must be contiguous"); RAFT_EXPECTS(predicted_label.is_exhaustive(), "predicted_label must be contiguous"); @@ -188,22 +198,10 @@ void contingency_matrix(const raft::handle_t& handle, * * Please see above for documentation of `contingency_matrix`. */ -template -void contingency_matrix(const raft::handle_t& handle, - raft::device_vector_view ground_truth, - raft::device_vector_view predicted_label, - raft::device_matrix_view out_mat, - opt_min_label_t&& min_label = std::nullopt, - opt_max_label_t&& max_label = std::nullopt) +template > +void contingency_matrix(Args... 
args) { - std::optional opt_min_label = std::forward(min_label); - std::optional opt_max_label = std::forward(max_label); - contingency_matrix(handle, ground_truth, predicted_label, out_mat, opt_min_label, opt_max_label); + contingency_matrix(std::forward(args)..., std::nullopt, std::nullopt); } }; // namespace stats }; // namespace raft diff --git a/cpp/include/raft/stats/detail/batched/information_criterion.cuh b/cpp/include/raft/stats/detail/batched/information_criterion.cuh index 1590910594..9807de8018 100644 --- a/cpp/include/raft/stats/detail/batched/information_criterion.cuh +++ b/cpp/include/raft/stats/detail/batched/information_criterion.cuh @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include diff --git a/cpp/include/raft/stats/detail/contingencyMatrix.cuh b/cpp/include/raft/stats/detail/contingencyMatrix.cuh index 27dcb96247..0fe9b6a092 100644 --- a/cpp/include/raft/stats/detail/contingencyMatrix.cuh +++ b/cpp/include/raft/stats/detail/contingencyMatrix.cuh @@ -236,7 +236,7 @@ size_t getContingencyMatrixWorkspaceSize(int nSamples, } /** - * @brief contruct contingency matrix given input ground truth and prediction + * @brief construct contingency matrix given input ground truth and prediction * labels. Users should call function getInputClassCardinality to find * and allocate memory for output. 
Similarly workspace requirements * should be checked using function getContingencyMatrixWorkspaceSize @@ -272,7 +272,7 @@ void contingencyMatrix(const T* groundTruth, // Output matrix will still have empty rows for label value {3,4} // Users can use "make_monotonic" to convert their discontinuous input label // range to a monotonically increasing one // - // this also serves as way to measure co-occurence/joint counts for NLP tasks which + // this also serves as way to measure co-occurrence/joint counts for NLP tasks which // can be used to then compute pointwise mutual information and mutual information if (minLabel == std::numeric_limits::max() || maxLabel == std::numeric_limits::max()) { getInputClassCardinality(groundTruth, nSamples, stream, minLabel, maxLabel); diff --git a/cpp/include/raft/stats/detail/histogram.cuh b/cpp/include/raft/stats/detail/histogram.cuh index 69bd721ded..8fae2ec7e4 100644 --- a/cpp/include/raft/stats/detail/histogram.cuh +++ b/cpp/include/raft/stats/detail/histogram.cuh @@ -16,10 +16,10 @@ #pragma once -#include -#include +#include #include #include +#include #include #include diff --git a/cpp/include/raft/stats/detail/mutual_info_score.cuh b/cpp/include/raft/stats/detail/mutual_info_score.cuh index fb454ee6ad..10bc0f5599 100644 --- a/cpp/include/raft/stats/detail/mutual_info_score.cuh +++ b/cpp/include/raft/stats/detail/mutual_info_score.cuh @@ -48,7 +48,7 @@ namespace detail { * cluster array * @param numUniqueClasses: number of unique classes * @param size: the size of array a and b (size of the contingency matrix is (size x size)) - * @param d_MI: pointer to the device memory that stores the aggreggate mutual information + * @param d_MI: pointer to the device memory that stores the aggregate mutual information */ template __global__ void mutual_info_kernel(const int* dContingencyMatrix, diff --git a/cpp/include/raft/stats/detail/silhouette_score.cuh b/cpp/include/raft/stats/detail/silhouette_score.cuh index cfaff5fcce..076d9b13e5 
100644 --- a/cpp/include/raft/stats/detail/silhouette_score.cuh +++ b/cpp/include/raft/stats/detail/silhouette_score.cuh @@ -140,7 +140,7 @@ void countLabels(const LabelT* labels, } /** - * @brief stucture that defines the division Lambda for elementwise op + * @brief structure that defines the division Lambda for elementwise op */ template struct DivOp { @@ -154,8 +154,8 @@ struct DivOp { }; /** - * @brief stucture that defines the elementwise operation to calculate silhouette score using params - * 'a' and 'b' + * @brief structure that defines the elementwise operation to calculate silhouette score using + * params 'a' and 'b' */ template struct SilOp { @@ -173,7 +173,7 @@ struct SilOp { }; /** - * @brief stucture that defines the reduction Lambda to find minimum between elements + * @brief structure that defines the reduction Lambda to find minimum between elements */ template struct MinOp { diff --git a/cpp/include/raft/stats/histogram.cuh b/cpp/include/raft/stats/histogram.cuh index 8efb2e8df8..4ad5de0926 100644 --- a/cpp/include/raft/stats/histogram.cuh +++ b/cpp/include/raft/stats/histogram.cuh @@ -20,8 +20,8 @@ #pragma once #include -#include #include +#include // This file is a shameless amalgamation of independent works done by // Lars Nyland and Andy Adinets @@ -109,4 +109,4 @@ void histogram(const raft::handle_t& handle, }; // end namespace stats }; // end namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/stats/information_criterion.cuh b/cpp/include/raft/stats/information_criterion.cuh index 8ab4723d01..0edeed7f0b 100644 --- a/cpp/include/raft/stats/information_criterion.cuh +++ b/cpp/include/raft/stats/information_criterion.cuh @@ -31,8 +31,8 @@ #include #include -#include #include +#include namespace raft { namespace stats { diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh index d5913e6176..5a39e29a8c 100644 --- a/cpp/include/raft/stats/mean.cuh +++ b/cpp/include/raft/stats/mean.cuh 
@@ -73,7 +73,7 @@ void mean(const raft::handle_t& handle, static_assert( std::is_same_v || std::is_same_v, "Data layout not supported"); - RAFT_EXPECTS(data.extent(1) == mu.extent(0), "Size mismatch betwen data and mu"); + RAFT_EXPECTS(data.extent(1) == mu.extent(0), "Size mismatch between data and mu"); RAFT_EXPECTS(mu.is_exhaustive(), "mu must be contiguous"); RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous"); detail::mean(mu.data_handle(), diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.cuh index fba2aa5b5a..9f49ff8be2 100644 --- a/cpp/include/raft/stats/mean_center.cuh +++ b/cpp/include/raft/stats/mean_center.cuh @@ -75,7 +75,7 @@ void mean_center(const raft::handle_t& handle, "Data layout not supported"); auto mean_vec_size = bcast_along_rows ? data.extent(1) : data.extent(0); RAFT_EXPECTS(out.extents() == data.extents(), "Size mismatch"); - RAFT_EXPECTS(mean_vec_size == mu.extent(0), "Size mismatch betwen data and mu"); + RAFT_EXPECTS(mean_vec_size == mu.extent(0), "Size mismatch between data and mu"); RAFT_EXPECTS(out.is_exhaustive(), "out must be contiguous"); RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous"); detail::meanCenter(out.data_handle(), @@ -139,7 +139,7 @@ void mean_add(const raft::handle_t& handle, "Data layout not supported"); auto mean_vec_size = bcast_along_rows ? 
data.extent(1) : data.extent(0); RAFT_EXPECTS(out.extents() == data.extents(), "Size mismatch"); - RAFT_EXPECTS(mean_vec_size == mu.extent(0), "Size mismatch betwen data and mu"); + RAFT_EXPECTS(mean_vec_size == mu.extent(0), "Size mismatch between data and mu"); RAFT_EXPECTS(out.is_exhaustive(), "out must be contiguous"); RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous"); detail::meanAdd(out.data_handle(), diff --git a/cpp/include/raft/stats/meanvar.cuh b/cpp/include/raft/stats/meanvar.cuh index 544aed092d..fab2184637 100644 --- a/cpp/include/raft/stats/meanvar.cuh +++ b/cpp/include/raft/stats/meanvar.cuh @@ -84,8 +84,8 @@ void meanvar(const raft::handle_t& handle, static_assert( std::is_same_v || std::is_same_v, "Data layout not supported"); - RAFT_EXPECTS(data.extent(1) == var.extent(0), "Size mismatch betwen data and var"); - RAFT_EXPECTS(mean.size() == var.size(), "Size mismatch betwen mean and var"); + RAFT_EXPECTS(data.extent(1) == var.extent(0), "Size mismatch between data and var"); + RAFT_EXPECTS(mean.size() == var.size(), "Size mismatch between mean and var"); RAFT_EXPECTS(mean.is_exhaustive(), "mean must be contiguous"); RAFT_EXPECTS(var.is_exhaustive(), "var must be contiguous"); RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous"); diff --git a/cpp/include/raft/stats/minmax.cuh b/cpp/include/raft/stats/minmax.cuh index 305e63cc10..a3cbec08fe 100644 --- a/cpp/include/raft/stats/minmax.cuh +++ b/cpp/include/raft/stats/minmax.cuh @@ -117,8 +117,8 @@ void minmax(const raft::handle_t& handle, ncols = colids.value().extent(0); } if (sampledcols.has_value()) { sampledcols_ptr = sampledcols.value().data_handle(); } - RAFT_EXPECTS(globalmin.extent(0) == ncols, "Size mismatch betwen globalmin and ncols"); - RAFT_EXPECTS(globalmax.extent(0) == ncols, "Size mismatch betwen globalmax and ncols"); + RAFT_EXPECTS(globalmin.extent(0) == ncols, "Size mismatch between globalmin and ncols"); + RAFT_EXPECTS(globalmax.extent(0) == ncols, "Size 
mismatch between globalmax and ncols"); detail::minmax(data.data_handle(), rowids_ptr, colids_ptr, diff --git a/cpp/include/raft/stats/mutual_info_score.cuh b/cpp/include/raft/stats/mutual_info_score.cuh index e953f12461..6c7f588050 100644 --- a/cpp/include/raft/stats/mutual_info_score.cuh +++ b/cpp/include/raft/stats/mutual_info_score.cuh @@ -67,7 +67,7 @@ double mutual_info_score(const raft::handle_t& handle, value_t upper_label_range) { RAFT_EXPECTS(first_cluster_array.extent(0) == second_cluster_array.extent(0), - "Size mismatch betwen first_cluster_array and second_cluster_array"); + "Size mismatch between first_cluster_array and second_cluster_array"); RAFT_EXPECTS(first_cluster_array.is_exhaustive(), "first_cluster_array must be contiguous"); RAFT_EXPECTS(second_cluster_array.is_exhaustive(), "second_cluster_array must be contiguous"); return detail::mutual_info_score(first_cluster_array.data_handle(), diff --git a/cpp/include/raft/stats/r2_score.cuh b/cpp/include/raft/stats/r2_score.cuh index e7fcdb6a4e..5b14c901de 100644 --- a/cpp/include/raft/stats/r2_score.cuh +++ b/cpp/include/raft/stats/r2_score.cuh @@ -68,7 +68,7 @@ value_t r2_score(const raft::handle_t& handle, raft::device_vector_view y, raft::device_vector_view y_hat) { - RAFT_EXPECTS(y.extent(0) == y_hat.extent(0), "Size mismatch betwen y and y_hat"); + RAFT_EXPECTS(y.extent(0) == y_hat.extent(0), "Size mismatch between y and y_hat"); RAFT_EXPECTS(y.is_exhaustive(), "y must be contiguous"); RAFT_EXPECTS(y_hat.is_exhaustive(), "y_hat must be contiguous"); diff --git a/cpp/include/raft/stats/rand_index.cuh b/cpp/include/raft/stats/rand_index.cuh index 72ad53f5d9..70384412a8 100644 --- a/cpp/include/raft/stats/rand_index.cuh +++ b/cpp/include/raft/stats/rand_index.cuh @@ -55,7 +55,7 @@ double rand_index(const raft::handle_t& handle, raft::device_vector_view second_cluster_array) { RAFT_EXPECTS(first_cluster_array.extent(0) == second_cluster_array.extent(0), - "Size mismatch betwen first_cluster_array 
and second_cluster_array"); + "Size mismatch between first_cluster_array and second_cluster_array"); RAFT_EXPECTS(first_cluster_array.is_exhaustive(), "first_cluster_array must be contiguous"); RAFT_EXPECTS(second_cluster_array.is_exhaustive(), "second_cluster_array must be contiguous"); return detail::compute_rand_index(first_cluster_array.data_handle(), diff --git a/cpp/include/raft/stats/regression_metrics.cuh b/cpp/include/raft/stats/regression_metrics.cuh index fd33f2af49..268440892c 100644 --- a/cpp/include/raft/stats/regression_metrics.cuh +++ b/cpp/include/raft/stats/regression_metrics.cuh @@ -76,7 +76,7 @@ void regression_metrics(const raft::handle_t& handle, raft::host_scalar_view median_abs_error) { RAFT_EXPECTS(predictions.extent(0) == ref_predictions.extent(0), - "Size mismatch betwen predictions and ref_predictions"); + "Size mismatch between predictions and ref_predictions"); RAFT_EXPECTS(predictions.is_exhaustive(), "predictions must be contiguous"); RAFT_EXPECTS(ref_predictions.is_exhaustive(), "ref_predictions must be contiguous"); RAFT_EXPECTS(mean_abs_error.data_handle() != nullptr, "mean_abs_error view must not be empty"); diff --git a/cpp/include/raft/stats/silhouette_score.cuh b/cpp/include/raft/stats/silhouette_score.cuh index 0b7d6436dd..fafddb7b23 100644 --- a/cpp/include/raft/stats/silhouette_score.cuh +++ b/cpp/include/raft/stats/silhouette_score.cuh @@ -100,13 +100,13 @@ value_t silhouette_score( idx_t n_unique_labels, raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded) { - RAFT_EXPECTS(labels.extent(0) == X_in.extent(0), "Size mismatch betwen labels and data"); + RAFT_EXPECTS(labels.extent(0) == X_in.extent(0), "Size mismatch between labels and data"); value_t* silhouette_score_per_sample_ptr = nullptr; if (silhouette_score_per_sample.has_value()) { silhouette_score_per_sample_ptr = silhouette_score_per_sample.value().data_handle(); RAFT_EXPECTS(silhouette_score_per_sample.value().extent(0) == 
X_in.extent(0), - "Size mismatch betwen silhouette_score_per_sample and data"); + "Size mismatch between silhouette_score_per_sample and data"); } return detail::silhouette_score(handle, X_in.data_handle(), @@ -172,13 +172,13 @@ value_t silhouette_score_batched( "of each mdspan argument must be an integral type."); static_assert(std::is_integral_v, "silhouette_score_batched: The label type must be an integral type."); - RAFT_EXPECTS(labels.extent(0) == X.extent(0), "Size mismatch betwen labels and data"); + RAFT_EXPECTS(labels.extent(0) == X.extent(0), "Size mismatch between labels and data"); value_t* scores_ptr = nullptr; if (silhouette_score_per_sample.has_value()) { scores_ptr = silhouette_score_per_sample.value().data_handle(); RAFT_EXPECTS(silhouette_score_per_sample.value().extent(0) == X.extent(0), - "Size mismatch betwen silhouette_score_per_sample and data"); + "Size mismatch between silhouette_score_per_sample and data"); } return batched::detail::silhouette_score(handle, X.data_handle(), diff --git a/cpp/include/raft/stats/v_measure.cuh b/cpp/include/raft/stats/v_measure.cuh index b8f16695bc..c52dd35fd8 100644 --- a/cpp/include/raft/stats/v_measure.cuh +++ b/cpp/include/raft/stats/v_measure.cuh @@ -71,7 +71,7 @@ double v_measure(const raft::handle_t& handle, double beta = 1.0) { RAFT_EXPECTS(truth_cluster_array.extent(0) == pred_cluster_array.extent(0), - "Size mismatch betwen truth_cluster_array and pred_cluster_array"); + "Size mismatch between truth_cluster_array and pred_cluster_array"); RAFT_EXPECTS(truth_cluster_array.is_exhaustive(), "truth_cluster_array must be contiguous"); RAFT_EXPECTS(pred_cluster_array.is_exhaustive(), "pred_cluster_array must be contiguous"); diff --git a/cpp/include/raft/stats/weighted_mean.cuh b/cpp/include/raft/stats/weighted_mean.cuh index 65d1b2c35f..30a922b243 100644 --- a/cpp/include/raft/stats/weighted_mean.cuh +++ b/cpp/include/raft/stats/weighted_mean.cuh @@ -123,7 +123,8 @@ void weighted_mean(const 
raft::handle_t& handle, RAFT_EXPECTS(weights.extent(0) == weight_size, "Size mismatch between weights and expected weight_size"); - RAFT_EXPECTS(mu.extent(0) == mean_vec_size, "Size mismatch betwen mu and expected mean_vec_size"); + RAFT_EXPECTS(mu.extent(0) == mean_vec_size, + "Size mismatch between mu and expected mean_vec_size"); detail::weightedMean(mu.data_handle(), data.data_handle(), diff --git a/cpp/include/raft/thirdparty/mdspan/README.md b/cpp/include/raft/thirdparty/mdspan/README.md index 1fee071212..a062777261 100644 --- a/cpp/include/raft/thirdparty/mdspan/README.md +++ b/cpp/include/raft/thirdparty/mdspan/README.md @@ -69,5 +69,5 @@ This implementation is fully conforming with revision 14 of P0009 with a few exc Acknowledgements ================ -This work was undertaken as part of the [Kokkos project](https://github.com/kokkos/kokkos) at Sandia National Laboratories. Sandia National Laboratories is a multimission laboratory managed and operated by National Technology & Engineering Solutions of Sandia, LLC, a wholly owned subsidary of Honeywell International Inc., for the U. S. Department of Energy's National Nuclear Security Administration under contract DE-NA0003525. +This work was undertaken as part of the [Kokkos project](https://github.com/kokkos/kokkos) at Sandia National Laboratories. Sandia National Laboratories is a multimission laboratory managed and operated by National Technology & Engineering Solutions of Sandia, LLC, a wholly owned subsidiary of Honeywell International Inc., for the U. S. Department of Energy's National Nuclear Security Administration under contract DE-NA0003525. 
diff --git a/cpp/include/raft/thirdparty/mdspan/cmake/metabench.cmake b/cpp/include/raft/thirdparty/mdspan/cmake/metabench.cmake index bc59ce4bd8..8ba683ddce 100644 --- a/cpp/include/raft/thirdparty/mdspan/cmake/metabench.cmake +++ b/cpp/include/raft/thirdparty/mdspan/cmake/metabench.cmake @@ -223,7 +223,7 @@ endfunction() # each `dataset` to be generated and a HTML chart to be generated from those # datasets. Several aspects of the compilation can be displayed, such as # compilation time and executable size. The aspect being plotted on the -# generated chart can be controled via the `ASPECT` argument. +# generated chart can be controlled via the `ASPECT` argument. # # Parameters # ---------- diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_standard_layout.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_standard_layout.cpp index 5f0f929e5e..d8edf31ab2 100644 --- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_standard_layout.cpp +++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_standard_layout.cpp @@ -168,7 +168,7 @@ MDSPAN_STATIC_TEST( >::value ); -// TODO: Remove this test alltogether? +// TODO: Remove this test altogether? // CT: Fails with GCC too after I removed the template parameter // I guess there is padding added after foo? 
#if 0 diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_left.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_left.hpp index 8cd2e14fb8..ed1478dc8b 100644 --- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_left.hpp +++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_left.hpp @@ -219,7 +219,7 @@ class layout_left::mapping { } #endif - // Not really public, but currently needed to implement fully constexpr useable submdspan: + // Not really public, but currently needed to implement fully constexpr usable submdspan: template constexpr index_type __get_stride(std::experimental::extents,integer_sequence) const { return _MDSPAN_FOLD_TIMES_RIGHT((Idx():1),1); diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_right.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_right.hpp index 118f3632c0..a9b64ca36a 100644 --- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_right.hpp +++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_right.hpp @@ -220,7 +220,7 @@ class layout_right::mapping { } #endif - // Not really public, but currently needed to implement fully constexpr useable submdspan: + // Not really public, but currently needed to implement fully constexpr usable submdspan: template constexpr index_type __get_stride(std::experimental::extents,integer_sequence) const { return _MDSPAN_FOLD_TIMES_RIGHT((Idx>N? __extents.template __extent():1),1); diff --git a/cpp/include/raft/util/cache.cuh b/cpp/include/raft/util/cache.cuh index 8394ce83b8..ccd5d1ab86 100644 --- a/cpp/include/raft/util/cache.cuh +++ b/cpp/include/raft/util/cache.cuh @@ -44,7 +44,7 @@ namespace raft::cache { * whose size equals the associativity. These are the cache sets. 
If a cache * set is full, then new indices are stored by replacing the oldest entries. * - * Using this index mapping we implement methods to store and retrive data from + * Using this index mapping we implement methods to store and retrieve data from * the cache buffer, where a unit of data that we are storing is math_t[n_vec]. * For example in SVM we store full columns of the kernel matrix at each cache * entry. @@ -300,7 +300,7 @@ class Cache { raft::update_host(n_cached, d_num_selected_out.data(), 1, stream); - // Similarily re-group the input indices + // Similarly re-group the input indices raft::copy(ws_tmp.data(), keys, n, stream); cub::DevicePartition::Flagged(d_temp_storage.data(), d_temp_storage_size, diff --git a/cpp/include/raft/util/cache_util.cuh b/cpp/include/raft/util/cache_util.cuh index 2d6f49eb19..4200be96e8 100644 --- a/cpp/include/raft/util/cache_util.cuh +++ b/cpp/include/raft/util/cache_util.cuh @@ -159,7 +159,7 @@ int DI arg_first_ge(const int* array, int n, int val) * @param [in] n number of elements in the array * @param [in] val the value we are searching for * @param [in] k - * @return the idx of the k-th occurance of val in array, or -1 if + * @return the idx of the k-th occurrence of val in array, or -1 if * the value is not found. */ int DI find_nth_occurrence(const int* array, int n, int val, int k) diff --git a/cpp/include/raft/util/cuda_utils.cuh b/cpp/include/raft/util/cuda_utils.cuh index 1d1c82eb94..5818fc21f3 100644 --- a/cpp/include/raft/util/cuda_utils.cuh +++ b/cpp/include/raft/util/cuda_utils.cuh @@ -18,6 +18,7 @@ #include #include +#include #include @@ -516,6 +517,16 @@ struct Nop { HDI Type operator()(Type in, IdxType i = 0) { return in; } }; +template +struct SqrtOp { + HDI Type operator()(Type in, IdxType i = 0) { return mySqrt(in); } +}; + +template +struct L0Op { + HDI Type operator()(Type in, IdxType i = 0) { return in != Type(0) ? 
Type(1) : Type(0); } +}; + template struct L1Op { HDI Type operator()(Type in, IdxType i = 0) { return myAbs(in); } @@ -530,6 +541,11 @@ template struct Sum { HDI Type operator()(Type a, Type b) { return a + b; } }; + +template +struct Max { + HDI Type operator()(Type a, Type b) { return myMax(a, b); } +}; /** @} */ /** @@ -611,9 +627,52 @@ DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) return inFlag; } +/** For every thread in the warp, set the corresponding bit to the thread's flag value. */ +DI uint32_t ballot(bool inFlag, uint32_t mask = 0xffffffffu) +{ +#if CUDART_VERSION >= 9000 + return __ballot_sync(mask, inFlag); +#else + return __ballot(inFlag); +#endif +} + +/** True CUDA alignment of a type (adapted from CUB) */ +template +struct cuda_alignment { + struct Pad { + T val; + char byte; + }; + + static constexpr int bytes = sizeof(Pad) - sizeof(T); +}; + +template +struct is_multiple { + static constexpr int large_align_bytes = cuda_alignment::bytes; + static constexpr int unit_align_bytes = cuda_alignment::bytes; + static constexpr bool value = + (sizeof(LargeT) % sizeof(UnitT) == 0) && (large_align_bytes % unit_align_bytes == 0); +}; + +template +inline constexpr bool is_multiple_v = is_multiple::value; + +template +struct is_shuffleable { + static constexpr bool value = + std::is_same_v || std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || std::is_same_v; +}; + +template +inline constexpr bool is_shuffleable_v = is_shuffleable::value; + /** * @brief Shuffle the data inside a warp - * @tparam T the data type (currently assumed to be 4B) + * @tparam T the data type * @param val value to be shuffled * @param srcLane lane from where to shuffle * @param width lane width @@ -621,7 +680,10 @@ DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) * @return the shuffled data */ template -DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu) +DI std::enable_if_t, T> 
shfl(T val, + int srcLane, + int width = WarpSize, + uint32_t mask = 0xffffffffu) { #if CUDART_VERSION >= 9000 return __shfl_sync(mask, val, srcLane, width); @@ -630,9 +692,40 @@ DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu) #endif } +/// Overload of shfl for data types not supported by the CUDA intrinsics +template +DI std::enable_if_t, T> shfl(T val, + int srcLane, + int width = WarpSize, + uint32_t mask = 0xffffffffu) +{ + using UnitT = + std::conditional_t, + unsigned int, + std::conditional_t, unsigned short, unsigned char>>; + + constexpr int n_words = sizeof(T) / sizeof(UnitT); + + T output; + UnitT* output_alias = reinterpret_cast(&output); + UnitT* input_alias = reinterpret_cast(&val); + + unsigned int shuffle_word; + shuffle_word = shfl((unsigned int)input_alias[0], srcLane, width, mask); + output_alias[0] = shuffle_word; + +#pragma unroll + for (int i = 1; i < n_words; ++i) { + shuffle_word = shfl((unsigned int)input_alias[i], srcLane, width, mask); + output_alias[i] = shuffle_word; + } + + return output; +} + /** * @brief Shuffle the data inside a warp from lower lane IDs - * @tparam T the data type (currently assumed to be 4B) + * @tparam T the data type * @param val value to be shuffled * @param delta lower lane ID delta from where to shuffle * @param width lane width @@ -640,7 +733,10 @@ DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu) * @return the shuffled data */ template -DI T shfl_up(T val, int delta, int width = WarpSize, uint32_t mask = 0xffffffffu) +DI std::enable_if_t, T> shfl_up(T val, + int delta, + int width = WarpSize, + uint32_t mask = 0xffffffffu) { #if CUDART_VERSION >= 9000 return __shfl_up_sync(mask, val, delta, width); @@ -649,9 +745,40 @@ DI T shfl_up(T val, int delta, int width = WarpSize, uint32_t mask = 0xffffffffu #endif } +/// Overload of shfl_up for data types not supported by the CUDA intrinsics +template +DI std::enable_if_t, T> shfl_up(T val, + int delta, 
+ int width = WarpSize, + uint32_t mask = 0xffffffffu) +{ + using UnitT = + std::conditional_t, + unsigned int, + std::conditional_t, unsigned short, unsigned char>>; + + constexpr int n_words = sizeof(T) / sizeof(UnitT); + + T output; + UnitT* output_alias = reinterpret_cast(&output); + UnitT* input_alias = reinterpret_cast(&val); + + unsigned int shuffle_word; + shuffle_word = shfl_up((unsigned int)input_alias[0], delta, width, mask); + output_alias[0] = shuffle_word; + +#pragma unroll + for (int i = 1; i < n_words; ++i) { + shuffle_word = shfl_up((unsigned int)input_alias[i], delta, width, mask); + output_alias[i] = shuffle_word; + } + + return output; +} + /** * @brief Shuffle the data inside a warp - * @tparam T the data type (currently assumed to be 4B) + * @tparam T the data type * @param val value to be shuffled * @param laneMask mask to be applied in order to perform xor shuffle * @param width lane width @@ -659,7 +786,10 @@ DI T shfl_up(T val, int delta, int width = WarpSize, uint32_t mask = 0xffffffffu * @return the shuffled data */ template -DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xffffffffu) +DI std::enable_if_t, T> shfl_xor(T val, + int laneMask, + int width = WarpSize, + uint32_t mask = 0xffffffffu) { #if CUDART_VERSION >= 9000 return __shfl_xor_sync(mask, val, laneMask, width); @@ -668,6 +798,37 @@ DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xfffff #endif } +/// Overload of shfl_xor for data types not supported by the CUDA intrinsics +template +DI std::enable_if_t, T> shfl_xor(T val, + int laneMask, + int width = WarpSize, + uint32_t mask = 0xffffffffu) +{ + using UnitT = + std::conditional_t, + unsigned int, + std::conditional_t, unsigned short, unsigned char>>; + + constexpr int n_words = sizeof(T) / sizeof(UnitT); + + T output; + UnitT* output_alias = reinterpret_cast(&output); + UnitT* input_alias = reinterpret_cast(&val); + + unsigned int shuffle_word; + shuffle_word = 
shfl_xor((unsigned int)input_alias[0], laneMask, width, mask); + output_alias[0] = shuffle_word; + +#pragma unroll + for (int i = 1; i < n_words; ++i) { + shuffle_word = shfl_xor((unsigned int)input_alias[i], laneMask, width, mask); + output_alias[i] = shuffle_word; + } + + return output; +} + /** * @brief Four-way byte dot product-accumulate. * @tparam T Four-byte integer: int or unsigned int @@ -730,24 +891,55 @@ DI auto dp4a(unsigned int a, unsigned int b, unsigned int c) -> unsigned int } /** - * @brief Warp-level sum reduction + * @brief Logical-warp-level reduction + * @tparam logicalWarpSize Logical warp size (2, 4, 8, 16 or 32) + * @tparam T Value type to be reduced + * @tparam ReduceLambda Reduction operation type * @param val input value + * @param reduce_op Reduction operation + * @return Reduction result. All lanes will have the valid result. + */ +template +DI T logicalWarpReduce(T val, ReduceLambda reduce_op) +{ +#pragma unroll + for (int i = logicalWarpSize / 2; i > 0; i >>= 1) { + T tmp = shfl_xor(val, i); + val = reduce_op(val, tmp); + } + return val; +} + +/** + * @brief Warp-level reduction * @tparam T Value type to be reduced + * @tparam ReduceLambda Reduction operation type + * @param val input value + * @param reduce_op Reduction operation + * @return Reduction result. All lanes will have the valid result. + * @note Why not cub? Because cub doesn't seem to allow working with arbitrary + * number of warps in a block. All threads in the warp must enter this + * function together + */ +template +DI T warpReduce(T val, ReduceLambda reduce_op) +{ + return logicalWarpReduce(val, reduce_op); +} + +/** + * @brief Warp-level sum reduction + * @tparam T Value type to be reduced + * @param val input value * @return Reduction result. All lanes will have the valid result. * @note Why not cub? Because cub doesn't seem to allow working with arbitrary * number of warps in a block. 
All threads in the warp must enter this * function together - * @todo Expand this to support arbitrary reduction ops */ template DI T warpReduce(T val) { -#pragma unroll - for (int i = WarpSize / 2; i > 0; i >>= 1) { - T tmp = shfl_xor(val, i); - val += tmp; - } - return val; + return warpReduce(val, raft::Sum{}); } /** diff --git a/cpp/include/raft/util/cudart_utils.hpp b/cpp/include/raft/util/cudart_utils.hpp index e2b0c5ecec..68a95da587 100644 --- a/cpp/include/raft/util/cudart_utils.hpp +++ b/cpp/include/raft/util/cudart_utils.hpp @@ -246,7 +246,7 @@ class grid_1d_block_t { * @tparam Type data type * @param dst destination pointer * @param src source pointer - * @param len lenth of the src/dst buffers in terms of number of elements + * @param len length of the src/dst buffers in terms of number of elements * @param stream cuda stream */ template @@ -355,6 +355,18 @@ inline int getMultiProcessorCount() return mpCount; } +/** helper method to get major minor compute capability version */ +inline std::pair getComputeCapability() +{ + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int majorVer, minorVer; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&majorVer, cudaDevAttrComputeCapabilityMajor, devId)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minorVer, cudaDevAttrComputeCapabilityMinor, devId)); + + return std::make_pair(majorVer, minorVer); +} + /** helper method to convert an array on device to a string on host */ template std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) @@ -503,7 +515,7 @@ constexpr inline auto upper_bound() -> half * resource in any case. * * @param[inout] mr if not null do nothing; otherwise get the current device resource and wrap it - * into a `pool_memory_resource` if neccessary and return the pointer to the result. + * into a `pool_memory_resource` if necessary and return the pointer to the result. 
* @param initial_size if a new memory pool is created, this would be its initial size (rounded up * to 256 bytes). * diff --git a/cpp/include/raft/util/device_atomics.cuh b/cpp/include/raft/util/device_atomics.cuh index 28f7516688..a79981124f 100644 --- a/cpp/include/raft/util/device_atomics.cuh +++ b/cpp/include/raft/util/device_atomics.cuh @@ -243,7 +243,7 @@ struct genericAtomicOperationImpl { // ------------------------------------------------------------------------------------------------- // specialized functions for operators // `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is -// not supproted.) `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int +// not supported.) `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int // `atomicAnd`, `atomicOr`, `atomicXor` support int, unsigned int, unsigned long long int // CUDA natively supports `unsigned long long int` for `atomicAdd`, diff --git a/cpp/include/raft/util/vectorized.cuh b/cpp/include/raft/util/vectorized.cuh index 21c44d2c93..5356f6a153 100644 --- a/cpp/include/raft/util/vectorized.cuh +++ b/cpp/include/raft/util/vectorized.cuh @@ -232,7 +232,7 @@ struct IOType { * to type promotion. It is then reinterpreted as a vector elements * to perform the kernel's work. * - * Caution : vectorized accesses requires input adresses to be memory aligned + * Caution : vectorized accesses requires input addresses to be memory aligned * according not to the input type but to the promoted type used for reading. 
* * Example demonstrating the use of load operations, performing math on such diff --git a/cpp/include/raft_distance/kmeans.hpp b/cpp/include/raft_distance/kmeans.hpp index 19f92dd977..a56021b110 100644 --- a/cpp/include/raft_distance/kmeans.hpp +++ b/cpp/include/raft_distance/kmeans.hpp @@ -41,4 +41,19 @@ void update_centroids(raft::handle_t const& handle, double* new_centroids, double* weight_per_cluster); -} // namespace raft::cluster::kmeans::runtime \ No newline at end of file +void cluster_cost(raft::handle_t const& handle, + const float* X, + int n_samples, + int n_features, + int n_clusters, + const float* centroids, + float* cost); + +void cluster_cost(raft::handle_t const& handle, + const double* X, + int n_samples, + int n_features, + int n_clusters, + const double* centroids, + double* cost); +} // namespace raft::cluster::kmeans::runtime diff --git a/cpp/scripts/__clang_cuda_additional_intrinsics.h b/cpp/scripts/__clang_cuda_additional_intrinsics.h index 8964d210bf..b9c032dc45 100644 --- a/cpp/scripts/__clang_cuda_additional_intrinsics.h +++ b/cpp/scripts/__clang_cuda_additional_intrinsics.h @@ -1,3 +1,4 @@ +// Copyright (c) 2022, NVIDIA CORPORATION. 
#ifndef __CLANG_CUDA_ADDITIONAL_INTRINSICS_H__ #define __CLANG_CUDA_ADDITIONAL_INTRINSICS_H__ #ifndef __CUDA__ @@ -7,46 +8,49 @@ // for some of these macros, see cuda_fp16.hpp #if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320)) #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) -#define __LDG_PTR "l" -#define __LBITS "64" +#define __LDG_PTR "l" +#define __LBITS "64" #else -#define __LDG_PTR "r" -#define __LBITS "32" -#endif // (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "r" +#define __LBITS "32" +#endif // (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) #define __NOARG -#define __MAKE_LD(cop, c_typ, int_typ, ptx_typ, inl_typ, mem) \ - __device__ __forceinline__ c_typ __ld ## cop (const c_typ* addr) { \ - int_typ out; \ - asm("ld." #cop "." ptx_typ " %0, [%1];" \ - : "=" inl_typ(out) : __LDG_PTR(addr)mem); \ - return (c_typ)out; \ +#define __MAKE_LD(cop, c_typ, int_typ, ptx_typ, inl_typ, mem) \ + __device__ __forceinline__ c_typ __ld##cop(const c_typ* addr) \ + { \ + int_typ out; \ + asm("ld." #cop "." ptx_typ " %0, [%1];" : "=" inl_typ(out) : __LDG_PTR(addr) mem); \ + return (c_typ)out; \ } -#define __MAKE_LD2(cop, c_typ, int_typ, ptx_typ, inl_typ, mem) \ - __device__ __forceinline__ c_typ __ld ## cop (const c_typ* addr) { \ - int_typ out1, out2; \ - asm("ld." #cop ".v2." ptx_typ " {%0, %1}, [%2];" \ - : "=" inl_typ(out1), "=" inl_typ(out2) : __LDG_PTR(addr)mem); \ - c_typ out; \ - out.x = out1; \ - out.y = out2; \ - return out; \ +#define __MAKE_LD2(cop, c_typ, int_typ, ptx_typ, inl_typ, mem) \ + __device__ __forceinline__ c_typ __ld##cop(const c_typ* addr) \ + { \ + int_typ out1, out2; \ + asm("ld." #cop ".v2." 
ptx_typ " {%0, %1}, [%2];" \ + : "=" inl_typ(out1), "=" inl_typ(out2) \ + : __LDG_PTR(addr) mem); \ + c_typ out; \ + out.x = out1; \ + out.y = out2; \ + return out; \ } -#define __MAKE_LD4(cop, c_typ, int_typ, ptx_typ, inl_typ, mem) \ - __device__ __forceinline__ c_typ __ld ## cop (const c_typ* addr) { \ - int_typ out1, out2, out3, out4; \ - asm("ld." #cop".v4." ptx_typ " {%0, %1, %2, %3}, [%4];" \ - : "=" inl_typ(out1), "=" inl_typ(out2), \ - "=" inl_typ(out3), "=" inl_typ(out4) : __LDG_PTR(addr)mem); \ - c_typ out; \ - out.x = out1; \ - out.y = out2; \ - out.z = out3; \ - out.w = out4; \ - return out; \ +#define __MAKE_LD4(cop, c_typ, int_typ, ptx_typ, inl_typ, mem) \ + __device__ __forceinline__ c_typ __ld##cop(const c_typ* addr) \ + { \ + int_typ out1, out2, out3, out4; \ + asm("ld." #cop ".v4." ptx_typ " {%0, %1, %2, %3}, [%4];" \ + : "=" inl_typ(out1), "=" inl_typ(out2), "=" inl_typ(out3), "=" inl_typ(out4) \ + : __LDG_PTR(addr) mem); \ + c_typ out; \ + out.x = out1; \ + out.y = out2; \ + out.z = out3; \ + out.w = out4; \ + return out; \ } __MAKE_LD(cg, char, short, "s8", "h", __NOARG) @@ -82,7 +86,6 @@ __MAKE_LD4(cg, int4, int, "s32", "r", __NOARG) __MAKE_LD4(cg, uint4, unsigned int, "u32", "r", __NOARG) __MAKE_LD4(cg, float4, float, "f32", "f", __NOARG) - __MAKE_LD(ca, char, short, "s8", "h", __NOARG) __MAKE_LD(ca, signed char, short, "s8", "h", __NOARG) __MAKE_LD(ca, unsigned char, short, "u8", "h", __NOARG) @@ -116,7 +119,6 @@ __MAKE_LD4(ca, int4, int, "s32", "r", __NOARG) __MAKE_LD4(ca, uint4, unsigned int, "u32", "r", __NOARG) __MAKE_LD4(ca, float4, float, "f32", "f", __NOARG) - __MAKE_LD(cs, char, short, "s8", "h", __NOARG) __MAKE_LD(cs, signed char, short, "s8", "h", __NOARG) __MAKE_LD(cs, unsigned char, short, "u8", "h", __NOARG) @@ -150,7 +152,6 @@ __MAKE_LD4(cs, int4, int, "s32", "r", __NOARG) __MAKE_LD4(cs, uint4, unsigned int, "u32", "r", __NOARG) __MAKE_LD4(cs, float4, float, "f32", "f", __NOARG) - __MAKE_LD(lu, char, short, "s8", "h", : 
"memory") __MAKE_LD(lu, signed char, short, "s8", "h", : "memory") __MAKE_LD(lu, unsigned char, short, "u8", "h", : "memory") @@ -184,7 +185,6 @@ __MAKE_LD4(lu, int4, int, "s32", "r", : "memory") __MAKE_LD4(lu, uint4, unsigned int, "u32", "r", : "memory") __MAKE_LD4(lu, float4, float, "f32", "f", : "memory") - __MAKE_LD(cv, char, short, "s8", "h", : "memory") __MAKE_LD(cv, signed char, short, "s8", "h", : "memory") __MAKE_LD(cv, unsigned char, short, "u8", "h", : "memory") @@ -218,26 +218,30 @@ __MAKE_LD4(cv, int4, int, "s32", "r", : "memory") __MAKE_LD4(cv, uint4, unsigned int, "u32", "r", : "memory") __MAKE_LD4(cv, float4, float, "f32", "f", : "memory") - -#define __MAKE_ST(cop, c_typ, int_typ, ptx_typ, inl_typ) \ - __device__ __forceinline__ void __st ## cop (c_typ* addr, c_typ v) { \ - asm("st." #cop "." ptx_typ " [%0], %1;" \ - :: __LDG_PTR(addr), inl_typ((int_typ)v) : "memory"); \ +#define __MAKE_ST(cop, c_typ, int_typ, ptx_typ, inl_typ) \ + __device__ __forceinline__ void __st##cop(c_typ* addr, c_typ v) \ + { \ + asm("st." #cop "." ptx_typ " [%0], %1;" ::__LDG_PTR(addr), inl_typ((int_typ)v) : "memory"); \ } -#define __MAKE_ST2(cop, c_typ, int_typ, ptx_typ, inl_typ) \ - __device__ __forceinline__ void __st ## cop (c_typ* addr, c_typ v) { \ - int_typ v1 = v.x, v2 = v.y; \ - asm("st." #cop ".v2." ptx_typ " [%0], {%1, %2};" \ - :: __LDG_PTR(addr), inl_typ(v1), inl_typ(v2) : "memory"); \ +#define __MAKE_ST2(cop, c_typ, int_typ, ptx_typ, inl_typ) \ + __device__ __forceinline__ void __st##cop(c_typ* addr, c_typ v) \ + { \ + int_typ v1 = v.x, v2 = v.y; \ + asm("st." #cop ".v2." ptx_typ " [%0], {%1, %2};" ::__LDG_PTR(addr), inl_typ(v1), inl_typ(v2) \ + : "memory"); \ } -#define __MAKE_ST4(cop, c_typ, int_typ, ptx_typ, inl_typ) \ - __device__ __forceinline__ c_typ __st ## cop (c_typ* addr, c_typ v) { \ - int_typ v1 = v.x, v2 = v.y, v3 = v.z, v4 = v.w; \ - asm("st." #cop ".v4." 
ptx_typ " [%0], {%1, %2, %3, %4};" \ - :: __LDG_PTR(addr), inl_typ(v1), inl_typ(v2), \ - inl_typ(v3), inl_typ(v4) : "memory"); \ +#define __MAKE_ST4(cop, c_typ, int_typ, ptx_typ, inl_typ) \ + __device__ __forceinline__ c_typ __st##cop(c_typ* addr, c_typ v) \ + { \ + int_typ v1 = v.x, v2 = v.y, v3 = v.z, v4 = v.w; \ + asm("st." #cop ".v4." ptx_typ " [%0], {%1, %2, %3, %4};" ::__LDG_PTR(addr), \ + inl_typ(v1), \ + inl_typ(v2), \ + inl_typ(v3), \ + inl_typ(v4) \ + : "memory"); \ } __MAKE_ST(wb, char, short, "s8", "h") @@ -273,7 +277,6 @@ __MAKE_ST4(wb, int4, int, "s32", "r") __MAKE_ST4(wb, uint4, unsigned int, "u32", "r") __MAKE_ST4(wb, float4, float, "f32", "f") - __MAKE_ST(cg, char, short, "s8", "h") __MAKE_ST(cg, signed char, short, "s8", "h") __MAKE_ST(cg, unsigned char, short, "u8", "h") @@ -307,7 +310,6 @@ __MAKE_ST4(cg, int4, int, "s32", "r") __MAKE_ST4(cg, uint4, unsigned int, "u32", "r") __MAKE_ST4(cg, float4, float, "f32", "f") - __MAKE_ST(cs, char, short, "s8", "h") __MAKE_ST(cs, signed char, short, "s8", "h") __MAKE_ST(cs, unsigned char, short, "u8", "h") @@ -341,7 +343,6 @@ __MAKE_ST4(cs, int4, int, "s32", "r") __MAKE_ST4(cs, uint4, unsigned int, "u32", "r") __MAKE_ST4(cs, float4, float, "f32", "f") - __MAKE_ST(wt, char, short, "s8", "h") __MAKE_ST(wt, signed char, short, "s8", "h") __MAKE_ST(wt, unsigned char, short, "u8", "h") @@ -375,7 +376,6 @@ __MAKE_ST4(wt, int4, int, "s32", "r") __MAKE_ST4(wt, uint4, unsigned int, "u32", "r") __MAKE_ST4(wt, float4, float, "f32", "f") - #undef __MAKE_ST4 #undef __MAKE_ST2 #undef __MAKE_ST @@ -386,6 +386,6 @@ __MAKE_ST4(wt, float4, float, "f32", "f") #undef __LBITS #undef __LDG_PTR -#endif // defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320)) +#endif // defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320)) -#endif // defined(__CLANG_CUDA_ADDITIONAL_INTRINSICS_H__) +#endif // defined(__CLANG_CUDA_ADDITIONAL_INTRINSICS_H__) diff --git a/cpp/scripts/gitutils.py 
b/cpp/scripts/gitutils.py index 8d4af79129..f6d6b97413 100644 --- a/cpp/scripts/gitutils.py +++ b/cpp/scripts/gitutils.py @@ -211,8 +211,8 @@ def modifiedFiles(pathFilter=None): If inside a CI-env (ie. TARGET_BRANCH and COMMIT_HASH are defined, and current branch is "current-pr-branch"), then lists out all files modified between these 2 branches. Locally, TARGET_BRANCH will try to be determined - from the current repo version and finding a coresponding branch named - 'branch-{major}.{minor}'. If this fails, this functino will list out all + from the current repo version and finding a corresponding branch named + 'branch-{major}.{minor}'. If this fails, this function will list out all the uncommitted files in the current branch. Such utility function is helpful while putting checker scripts as part of diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh new file mode 100755 index 0000000000..db5a8b5804 --- /dev/null +++ b/cpp/scripts/run-cmake-format.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright (c) 2022, NVIDIA CORPORATION. + +# This script is a wrapper for cmakelang that may be used with pre-commit. The +# wrapping is necessary because RAPIDS libraries split configuration for +# cmakelang linters between a local config file and a second config file that's +# shared across all of RAPIDS via rapids-cmake. In order to keep it up to date +# this file is only maintained in one place (the rapids-cmake repo) and +# pulled down during builds. We need a way to invoke CMake linting commands +# without causing pre-commit failures (which could block local commits or CI), +# while also being sufficiently flexible to allow users to maintain the config +# file independently of a build directory. +# +# This script provides the minimal functionality to enable those use cases. It +# searches in a number of predefined locations for the rapids-cmake config file +# and exits gracefully if the file is not found. 
If a user wishes to specify a +# config file at a nonstandard location, they may do so by setting the +# environment variable RAPIDS_CMAKE_FORMAT_FILE. +# +# This script can be invoked directly anywhere within the project repository. +# Alternatively, it may be invoked as a pre-commit hook via +# `pre-commit run (cmake-format)|(cmake-lint)`. +# +# Usage: +# bash run-cmake-format.sh {cmake-format,cmake-lint} infile [infile ...] + +status=0 +if [ -z ${RAFT_ROOT:+PLACEHOLDER} ]; then + RAFT_BUILD_DIR=$(git rev-parse --show-toplevel 2>&1)/cpp/build + status=$? +else + RAFT_BUILD_DIR=${RAFT_ROOT} +fi + +if ! [ ${status} -eq 0 ]; then + if [[ ${RAFT_BUILD_DIR} == *"not a git repository"* ]]; then + echo "This script must be run inside the raft repository, or the RAFT_ROOT environment variable must be set." + else + echo "Script failed with unknown error attempting to determine project root:" + echo ${RAFT_BUILD_DIR} + fi + exit 1 +fi + +DEFAULT_FORMAT_FILE_LOCATIONS=( + "${RAFT_BUILD_DIR:-${HOME}}/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" +) + +if [ -z ${RAPIDS_CMAKE_FORMAT_FILE:+PLACEHOLDER} ]; then + for file_path in ${DEFAULT_FORMAT_FILE_LOCATIONS[@]}; do + if [ -f ${file_path} ]; then + RAPIDS_CMAKE_FORMAT_FILE=${file_path} + break + fi + done +fi + +if [ -z ${RAPIDS_CMAKE_FORMAT_FILE:+PLACEHOLDER} ]; then + echo "The rapids-cmake cmake-format configuration file was not found at any of the default search locations: " + echo "" + ( IFS=$'\n'; echo "${DEFAULT_FORMAT_FILE_LOCATIONS[*]}" ) + echo "" + echo "Try setting the environment variable RAPIDS_CMAKE_FORMAT_FILE to the path to the config file." 
+ exit 0 +else + echo "Using format file ${RAPIDS_CMAKE_FORMAT_FILE}" +fi + +if [[ $1 == "cmake-format" ]]; then + cmake-format -i --config-files cpp/cmake/config.json ${RAPIDS_CMAKE_FORMAT_FILE} -- ${@:2} +elif [[ $1 == "cmake-lint" ]]; then + # Since the pre-commit hook is verbose, we have to be careful to only + # present cmake-lint's output (which is quite verbose) if we actually + # observe a failure. + OUTPUT=$(cmake-lint --config-files cpp/cmake/config.json ${RAPIDS_CMAKE_FORMAT_FILE} -- ${@:2}) + status=$? + + if ! [ ${status} -eq 0 ]; then + echo "${OUTPUT}" + fi + exit ${status} +fi diff --git a/cpp/src/distance/cluster_cost.cuh b/cpp/src/distance/cluster_cost.cuh new file mode 100644 index 0000000000..344673830b --- /dev/null +++ b/cpp/src/distance/cluster_cost.cuh @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +namespace raft::cluster::kmeans::runtime { +template +void cluster_cost(const raft::handle_t& handle, + const ElementType* X, + IndexType n_samples, + IndexType n_features, + IndexType n_clusters, + const ElementType* centroids, + ElementType* cost) +{ + rmm::device_uvector workspace(n_samples * sizeof(IndexType), handle.get_stream()); + + rmm::device_uvector x_norms(n_samples, handle.get_stream()); + rmm::device_uvector centroid_norms(n_clusters, handle.get_stream()); + raft::linalg::rowNorm( + x_norms.data(), X, n_features, n_samples, raft::linalg::L2Norm, true, handle.get_stream()); + raft::linalg::rowNorm(centroid_norms.data(), + centroids, + n_features, + n_clusters, + raft::linalg::L2Norm, + true, + handle.get_stream()); + + auto min_cluster_distance = + raft::make_device_vector>(handle, n_samples); + raft::distance::fusedL2NNMinReduce(min_cluster_distance.data_handle(), + X, + centroids, + x_norms.data(), + centroid_norms.data(), + n_samples, + n_clusters, + n_features, + (void*)workspace.data(), + false, + true, + handle.get_stream()); + + auto distances = raft::make_device_vector(handle, n_samples); + thrust::transform( + handle.get_thrust_policy(), + min_cluster_distance.data_handle(), + min_cluster_distance.data_handle() + n_samples, + distances.data_handle(), + [] __device__(const raft::KeyValuePair& a) { return a.value; }); + + rmm::device_scalar device_cost(0, handle.get_stream()); + raft::cluster::kmeans::cluster_cost( + handle, + distances.view(), + workspace, + make_device_scalar_view(device_cost.data()), + [] __device__(const ElementType& a, const ElementType& b) { return a + b; }); + + raft::update_host(cost, device_cost.data(), 1, handle.get_stream()); +} +} // namespace raft::cluster::kmeans::runtime diff --git a/cpp/src/distance/cluster_cost_double.cu b/cpp/src/distance/cluster_cost_double.cu new file mode 100644 index 0000000000..b811b0bf8d --- /dev/null +++ 
b/cpp/src/distance/cluster_cost_double.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cluster_cost.cuh" +#include +#include +#include + +namespace raft::cluster::kmeans::runtime { + +void cluster_cost(const raft::handle_t& handle, + const double* X, + int n_samples, + int n_features, + int n_clusters, + const double* centroids, + double* cost) +{ + cluster_cost(handle, X, n_samples, n_features, n_clusters, centroids, cost); +} +} // namespace raft::cluster::kmeans::runtime diff --git a/cpp/src/distance/cluster_cost_float.cu b/cpp/src/distance/cluster_cost_float.cu new file mode 100644 index 0000000000..d78ea446da --- /dev/null +++ b/cpp/src/distance/cluster_cost_float.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cluster_cost.cuh" +#include +#include +#include + +namespace raft::cluster::kmeans::runtime { + +void cluster_cost(const raft::handle_t& handle, + const float* X, + int n_samples, + int n_features, + int n_clusters, + const float* centroids, + float* cost) +{ + cluster_cost(handle, X, n_samples, n_features, n_clusters, centroids, cost); +} +} // namespace raft::cluster::kmeans::runtime diff --git a/cpp/src/nn/specializations/detail/ivfpq_build.cu b/cpp/src/nn/specializations/detail/ivfpq_build.cu new file mode 100644 index 0000000000..9ff22a3729 --- /dev/null +++ b/cpp/src/nn/specializations/detail/ivfpq_build.cu @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace raft::neighbors::ivf_pq { + +#define RAFT_INST_BUILD_EXTEND(T, IdxT) \ + auto build(const handle_t& handle, \ + const index_params& params, \ + const T* dataset, \ + IdxT n_rows, \ + uint32_t dim) \ + ->index \ + { \ + return build(handle, params, dataset, n_rows, dim); \ + } \ + auto extend(const handle_t& handle, \ + const index& orig_index, \ + const T* new_vectors, \ + const IdxT* new_indices, \ + IdxT n_rows) \ + ->index \ + { \ + return extend(handle, orig_index, new_vectors, new_indices, n_rows); \ + } \ + \ + void build(const handle_t& handle, \ + const index_params& params, \ + const T* dataset, \ + IdxT n_rows, \ + uint32_t dim, \ + index* idx) \ + { \ + *idx = build(handle, params, dataset, n_rows, dim); \ + } \ + void extend(const handle_t& handle, \ + index* idx, \ + const T* new_vectors, \ + const IdxT* new_indices, \ + IdxT n_rows) \ + { \ + extend(handle, idx, new_vectors, new_indices, n_rows); \ + } + +RAFT_INST_BUILD_EXTEND(float, uint64_t); +RAFT_INST_BUILD_EXTEND(int8_t, uint64_t); +RAFT_INST_BUILD_EXTEND(uint8_t, uint64_t); + +#undef RAFT_INST_BUILD_EXTEND + +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/src/nn/specializations/detail/ivfpq_search.cu b/cpp/src/nn/specializations/detail/ivfpq_search.cu new file mode 100644 index 0000000000..80bf589803 --- /dev/null +++ b/cpp/src/nn/specializations/detail/ivfpq_search.cu @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace raft::neighbors::ivf_pq { + +#define RAFT_SEARCH_INST(T, IdxT) \ + void search(const handle_t& handle, \ + const search_params& params, \ + const index& idx, \ + const T* queries, \ + uint32_t n_queries, \ + uint32_t k, \ + IdxT* neighbors, \ + float* distances, \ + rmm::mr::device_memory_resource* mr) \ + { \ + search(handle, params, idx, queries, n_queries, k, neighbors, distances, mr); \ + } + +RAFT_SEARCH_INST(float, uint64_t); +RAFT_SEARCH_INST(int8_t, uint64_t); +RAFT_SEARCH_INST(uint8_t, uint64_t); + +#undef RAFT_INST_SEARCH + +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 0f5ebabcb9..dae0f6f6b1 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -1,41 +1,37 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2021-2022, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= - -################################################################################################### -# - compiler function ----------------------------------------------------------------------------- +# ################################################################################################## +# * compiler function ----------------------------------------------------------------------------- function(ConfigureTest) - set(options OPTIONAL DIST NN) - set(oneValueArgs NAME ) - set(multiValueArgs PATH TARGETS CONFIGURATIONS) + set(options OPTIONAL DIST NN) + set(oneValueArgs NAME) + set(multiValueArgs PATH TARGETS CONFIGURATIONS) - cmake_parse_arguments(ConfigureTest "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN} ) + cmake_parse_arguments(ConfigureTest "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(TEST_NAME ${ConfigureTest_NAME}) + set(TEST_NAME ${ConfigureTest_NAME}) - add_executable(${TEST_NAME} ${ConfigureTest_PATH}) + add_executable(${TEST_NAME} ${ConfigureTest_PATH}) - message("TEST PATH: ${ConfigureTest_PATH}") + message("TEST PATH: ${ConfigureTest_PATH}") - target_link_libraries(${TEST_NAME} - PRIVATE - raft::raft + target_link_libraries( + ${TEST_NAME} + PRIVATE raft::raft $<$:raft::distance> $<$:raft::nn> GTest::gtest @@ -43,251 +39,255 @@ function(ConfigureTest) Threads::Threads $ $ - ) + ) - add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) + add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) - set_target_properties(${TEST_NAME} - PROPERTIES - # 
set target compile options - INSTALL_RPATH "\$ORIGIN/../../../lib" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - ) + set_target_properties( + ${TEST_NAME} + PROPERTIES # set target compile options + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ) - target_compile_options(${TEST_NAME} - PRIVATE "$<$:${RAFT_CXX_FLAGS}>" - "$<$:${RAFT_CUDA_FLAGS}>" - ) + target_compile_options( + ${TEST_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) - target_include_directories(${TEST_NAME} - PUBLIC "$" - ) + target_include_directories(${TEST_NAME} PUBLIC "$") - install( - TARGETS ${TEST_NAME} - COMPONENT testing - DESTINATION bin/gtests/libraft - EXCLUDE_FROM_ALL) + install( + TARGETS ${TEST_NAME} + COMPONENT testing + DESTINATION bin/gtests/libraft + EXCLUDE_FROM_ALL + ) endfunction() +# ################################################################################################## +# test sources ################################################################################## +# ################################################################################################## -################################################################################################### -### test sources ################################################################################## -################################################################################################### - -################################################################################################### -# - distance tests ------------------------------------------------------------------------- +# ################################################################################################## +# * distance tests ------------------------------------------------------------------------- if(BUILD_TESTS) - ConfigureTest(NAME CLUSTER_TEST - PATH - 
test/cluster/kmeans.cu - test/cluster_solvers.cu - test/cluster/linkage.cu - OPTIONAL DIST NN - ) + ConfigureTest( + NAME CLUSTER_TEST PATH test/cluster/kmeans.cu test/cluster_solvers.cu test/cluster/linkage.cu + OPTIONAL DIST NN + ) - ConfigureTest(NAME CORE_TEST - PATH - test/common/logger.cpp - test/handle.cpp - test/interruptible.cu - test/nvtx.cpp - test/mdarray.cu - test/mdspan_utils.cu - test/memory_type.cpp - test/span.cpp - test/span.cu - test/test.cpp - ) + ConfigureTest( + NAME + CORE_TEST + PATH + test/common/logger.cpp + test/handle.cpp + test/interruptible.cu + test/nvtx.cpp + test/mdarray.cu + test/mdspan_utils.cu + test/memory_type.cpp + test/span.cpp + test/span.cu + test/test.cpp + ) - ConfigureTest(NAME DISTANCE_TEST - PATH - test/distance/dist_adj.cu - test/distance/dist_canberra.cu - test/distance/dist_chebyshev.cu - test/distance/dist_correlation.cu - test/distance/dist_cos.cu - test/distance/dist_euc_exp.cu - test/distance/dist_euc_unexp.cu - test/distance/dist_hamming.cu - test/distance/dist_hellinger.cu - test/distance/dist_jensen_shannon.cu - test/distance/dist_kl_divergence.cu - test/distance/dist_l1.cu - test/distance/dist_minkowski.cu - test/distance/dist_russell_rao.cu - test/distance/fused_l2_nn.cu - test/distance/gram.cu - OPTIONAL DIST - ) + ConfigureTest( + NAME + DISTANCE_TEST + PATH + test/distance/dist_adj.cu + test/distance/dist_canberra.cu + test/distance/dist_chebyshev.cu + test/distance/dist_correlation.cu + test/distance/dist_cos.cu + test/distance/dist_euc_exp.cu + test/distance/dist_euc_unexp.cu + test/distance/dist_eucsqrt_exp.cu + test/distance/dist_hamming.cu + test/distance/dist_hellinger.cu + test/distance/dist_jensen_shannon.cu + test/distance/dist_kl_divergence.cu + test/distance/dist_l1.cu + test/distance/dist_minkowski.cu + test/distance/dist_russell_rao.cu + test/distance/fused_l2_nn.cu + test/distance/gram.cu + OPTIONAL + DIST + ) - ConfigureTest(NAME LABEL_TEST - PATH - test/label/label.cu - 
test/label/merge_labels.cu - ) + ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu) - ConfigureTest(NAME LINALG_TEST - PATH - test/linalg/add.cu - test/linalg/axpy.cu - test/linalg/binary_op.cu - test/linalg/cholesky_r1.cu - test/linalg/coalesced_reduction.cu - test/linalg/divide.cu - test/linalg/dot.cu - test/linalg/eig.cu - test/linalg/eig_sel.cu - test/linalg/gemm_layout.cu - test/linalg/gemv.cu - test/linalg/map.cu - test/linalg/map_then_reduce.cu - test/linalg/matrix_vector.cu - test/linalg/matrix_vector_op.cu - test/linalg/mean_squared_error.cu - test/linalg/multiply.cu - test/linalg/norm.cu - test/linalg/power.cu - test/linalg/reduce.cu - test/linalg/reduce_cols_by_key.cu - test/linalg/reduce_rows_by_key.cu - test/linalg/rsvd.cu - test/linalg/sqrt.cu - test/linalg/strided_reduction.cu - test/linalg/subtract.cu - test/linalg/svd.cu - test/linalg/ternary_op.cu - test/linalg/transpose.cu - test/linalg/unary_op.cu - ) + ConfigureTest( + NAME + LINALG_TEST + PATH + test/linalg/add.cu + test/linalg/axpy.cu + test/linalg/binary_op.cu + test/linalg/cholesky_r1.cu + test/linalg/coalesced_reduction.cu + test/linalg/divide.cu + test/linalg/dot.cu + test/linalg/eig.cu + test/linalg/eig_sel.cu + test/linalg/gemm_layout.cu + test/linalg/gemv.cu + test/linalg/map.cu + test/linalg/map_then_reduce.cu + test/linalg/matrix_vector.cu + test/linalg/matrix_vector_op.cu + test/linalg/mean_squared_error.cu + test/linalg/multiply.cu + test/linalg/norm.cu + test/linalg/normalize.cu + test/linalg/power.cu + test/linalg/reduce.cu + test/linalg/reduce_cols_by_key.cu + test/linalg/reduce_rows_by_key.cu + test/linalg/rsvd.cu + test/linalg/sqrt.cu + test/linalg/strided_reduction.cu + test/linalg/subtract.cu + test/linalg/svd.cu + test/linalg/ternary_op.cu + test/linalg/transpose.cu + test/linalg/unary_op.cu + ) - ConfigureTest(NAME MATRIX_TEST - PATH - test/matrix/argmax.cu - test/matrix/argmin.cu - test/matrix/columnSort.cu - test/matrix/diagonal.cu - 
test/matrix/gather.cu - test/matrix/linewise_op.cu - test/matrix/math.cu - test/matrix/matrix.cu - test/matrix/norm.cu - test/matrix/reverse.cu - test/matrix/slice.cu - test/matrix/triangular.cu - test/spectral_matrix.cu - ) + ConfigureTest( + NAME + MATRIX_TEST + PATH + test/matrix/argmax.cu + test/matrix/argmin.cu + test/matrix/columnSort.cu + test/matrix/diagonal.cu + test/matrix/gather.cu + test/matrix/linewise_op.cu + test/matrix/math.cu + test/matrix/matrix.cu + test/matrix/norm.cu + test/matrix/reverse.cu + test/matrix/slice.cu + test/matrix/triangular.cu + test/spectral_matrix.cu + ) - ConfigureTest(NAME RANDOM_TEST - PATH - test/random/make_blobs.cu - test/random/make_regression.cu - test/random/multi_variable_gaussian.cu - test/random/permute.cu - test/random/rng.cu - test/random/rng_int.cu - test/random/rmat_rectangular_generator.cu - test/random/sample_without_replacement.cu - ) + ConfigureTest( + NAME + RANDOM_TEST + PATH + test/random/make_blobs.cu + test/random/make_regression.cu + test/random/multi_variable_gaussian.cu + test/random/permute.cu + test/random/rng.cu + test/random/rng_int.cu + test/random/rmat_rectangular_generator.cu + test/random/sample_without_replacement.cu + ) - ConfigureTest(NAME SOLVERS_TEST - PATH - test/cluster_solvers_deprecated.cu - test/eigen_solvers.cu - test/lap/lap.cu - test/mst.cu - ) + ConfigureTest( + NAME SOLVERS_TEST PATH test/cluster_solvers_deprecated.cu test/eigen_solvers.cu test/lap/lap.cu + test/mst.cu OPTIONAL DIST + ) - ConfigureTest(NAME SPARSE_TEST - PATH - test/sparse/add.cu - test/sparse/convert_coo.cu - test/sparse/convert_csr.cu - test/sparse/csr_row_slice.cu - test/sparse/csr_to_dense.cu - test/sparse/csr_transpose.cu - test/sparse/degree.cu - test/sparse/filter.cu - test/sparse/norm.cu - test/sparse/reduce.cu - test/sparse/row_op.cu - test/sparse/sort.cu - test/sparse/symmetrize.cu - ) + ConfigureTest( + NAME + SPARSE_TEST + PATH + test/sparse/add.cu + test/sparse/convert_coo.cu + 
test/sparse/convert_csr.cu + test/sparse/csr_row_slice.cu + test/sparse/csr_to_dense.cu + test/sparse/csr_transpose.cu + test/sparse/degree.cu + test/sparse/filter.cu + test/sparse/norm.cu + test/sparse/reduce.cu + test/sparse/row_op.cu + test/sparse/sort.cu + test/sparse/spgemmi.cu + test/sparse/symmetrize.cu + ) - ConfigureTest(NAME SPARSE_DIST_TEST - PATH - test/sparse/dist_coo_spmv.cu - test/sparse/distance.cu - OPTIONAL DIST NN - ) + ConfigureTest( + NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu OPTIONAL DIST + NN + ) - ConfigureTest(NAME SPARSE_NEIGHBORS_TEST - PATH - test/sparse/neighbors/connect_components.cu - test/sparse/neighbors/brute_force.cu - test/sparse/neighbors/knn_graph.cu - OPTIONAL DIST NN - ) + ConfigureTest( + NAME SPARSE_NEIGHBORS_TEST PATH test/sparse/neighbors/connect_components.cu + test/sparse/neighbors/brute_force.cu test/sparse/neighbors/knn_graph.cu OPTIONAL DIST NN + ) - ConfigureTest(NAME NEIGHBORS_TEST - PATH - test/neighbors/ann_ivf_flat.cu - test/neighbors/ann_ivf_pq/test_float_int64_t.cu - test/neighbors/ann_ivf_pq/test_float_uint32_t.cu - test/neighbors/ann_ivf_pq/test_float_uint64_t.cu - test/neighbors/ann_ivf_pq/test_int8_t_uint64_t.cu - test/neighbors/ann_ivf_pq/test_uint8_t_uint64_t.cu - test/neighbors/knn.cu - test/neighbors/fused_l2_knn.cu - test/neighbors/haversine.cu - test/neighbors/ball_cover.cu - test/neighbors/epsilon_neighborhood.cu - test/neighbors/faiss_mr.cu - test/neighbors/selection.cu - OPTIONAL DIST NN - ) + ConfigureTest( + NAME + NEIGHBORS_TEST + PATH + test/neighbors/ann_ivf_flat.cu + test/neighbors/ann_ivf_pq/test_float_int64_t.cu + test/neighbors/ann_ivf_pq/test_float_uint32_t.cu + test/neighbors/ann_ivf_pq/test_float_uint64_t.cu + test/neighbors/ann_ivf_pq/test_int8_t_uint64_t.cu + test/neighbors/ann_ivf_pq/test_uint8_t_uint64_t.cu + test/neighbors/knn.cu + test/neighbors/fused_l2_knn.cu + test/neighbors/haversine.cu + test/neighbors/ball_cover.cu + 
test/neighbors/epsilon_neighborhood.cu + test/neighbors/faiss_mr.cu + test/neighbors/refine.cu + test/neighbors/selection.cu + OPTIONAL + DIST + NN + ) - ConfigureTest(NAME STATS_TEST - PATH - test/stats/accuracy.cu - test/stats/adjusted_rand_index.cu - test/stats/completeness_score.cu - test/stats/contingencyMatrix.cu - test/stats/cov.cu - test/stats/dispersion.cu - test/stats/entropy.cu - test/stats/histogram.cu - test/stats/homogeneity_score.cu - test/stats/information_criterion.cu - test/stats/kl_divergence.cu - test/stats/mean.cu - test/stats/meanvar.cu - test/stats/mean_center.cu - test/stats/minmax.cu - test/stats/mutual_info_score.cu - test/stats/r2_score.cu - test/stats/rand_index.cu - test/stats/regression_metrics.cu - test/stats/silhouette_score.cu - test/stats/stddev.cu - test/stats/sum.cu - test/stats/trustworthiness.cu - test/stats/weighted_mean.cu - test/stats/v_measure.cu - OPTIONAL DIST NN - ) + ConfigureTest( + NAME + STATS_TEST + PATH + test/stats/accuracy.cu + test/stats/adjusted_rand_index.cu + test/stats/completeness_score.cu + test/stats/contingencyMatrix.cu + test/stats/cov.cu + test/stats/dispersion.cu + test/stats/entropy.cu + test/stats/histogram.cu + test/stats/homogeneity_score.cu + test/stats/information_criterion.cu + test/stats/kl_divergence.cu + test/stats/mean.cu + test/stats/meanvar.cu + test/stats/mean_center.cu + test/stats/minmax.cu + test/stats/mutual_info_score.cu + test/stats/r2_score.cu + test/stats/rand_index.cu + test/stats/regression_metrics.cu + test/stats/silhouette_score.cu + test/stats/stddev.cu + test/stats/sum.cu + test/stats/trustworthiness.cu + test/stats/weighted_mean.cu + test/stats/v_measure.cu + OPTIONAL + DIST + NN + ) - ConfigureTest(NAME UTILS_TEST - PATH - test/common/seive.cu - test/cudart_utils.cpp - test/device_atomics.cu - test/integer_utils.cpp - test/pow2_utils.cu - ) + ConfigureTest( + NAME UTILS_TEST PATH test/common/seive.cu test/cudart_utils.cpp test/device_atomics.cu + test/integer_utils.cpp 
test/pow2_utils.cu + ) endif() diff --git a/cpp/test/common/seive.cu b/cpp/test/common/seive.cu index 8044dbb532..54a59d6251 100644 --- a/cpp/test/common/seive.cu +++ b/cpp/test/common/seive.cu @@ -15,7 +15,7 @@ */ #include -#include +#include namespace raft { namespace common { diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/cudart_utils.cpp index 8c47372c4f..7e8585c7c7 100644 --- a/cpp/test/cudart_utils.cpp +++ b/cpp/test/cudart_utils.cpp @@ -60,7 +60,7 @@ TEST(Raft, Utils) std::string msg_full{e.what()}; // only use first line std::string msg = msg_full.substr(0, msg_full.find('\n')); - std::string re_exp{"^exception occured! file="}; + std::string re_exp{"^exception occurred! file="}; re_exp += reg_file; // test code must be at line >10 (copyright), assume line is never >9999 re_exp += " line=\\d{2,4}: "; diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu index 72906af1b2..f3f36b4576 100644 --- a/cpp/test/distance/dist_adj.cu +++ b/cpp/test/distance/dist_adj.cu @@ -26,7 +26,7 @@ namespace raft { namespace distance { template -__global__ void naiveDistanceAdjKernel(bool* dist, +__global__ void naiveDistanceAdjKernel(uint8_t* dist, const DataType* x, const DataType* y, int m, @@ -50,7 +50,7 @@ __global__ void naiveDistanceAdjKernel(bool* dist, } template -void naiveDistanceAdj(bool* dist, +void naiveDistanceAdj(uint8_t* dist, const DataType* x, const DataType* y, int m, @@ -74,6 +74,18 @@ struct DistanceAdjInputs { unsigned long long int seed; }; +template +struct threshold_final_op { + DataT threshold_val; + + __device__ __host__ threshold_final_op() noexcept : threshold_val(0.0) {} + __device__ __host__ threshold_final_op(DataT val) noexcept : threshold_val(val) {} + __device__ __host__ OutT operator()(AccT d_val, Index g_idx) const noexcept + { + return d_val <= threshold_val; + } +}; + template ::std::ostream& operator<<(::std::ostream& os, const DistanceAdjInputs& dims) { @@ -109,25 +121,28 @@ class DistanceAdjTest : public 
::testing::TestWithParam( + getWorkspaceSize( x.data(), y.data(), m, n, k); rmm::device_uvector workspace(worksize, stream); - auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) { - return d_val <= threshold; - }; - raft::distance::distance( - x.data(), - y.data(), - dist.data(), - m, - n, - k, - workspace.data(), - workspace.size(), - fin_op, - stream, - isRowMajor); + using threshold_final_op_ = threshold_final_op; + threshold_final_op_ threshold_op(threshold); + + raft::distance::distance(x.data(), + y.data(), + dist.data(), + m, + n, + k, + workspace.data(), + workspace.size(), + threshold_op, + stream, + isRowMajor); handle.sync_stream(stream); } @@ -135,8 +150,12 @@ class DistanceAdjTest : public ::testing::TestWithParam params; - rmm::device_uvector dist_ref; - rmm::device_uvector dist; + // We use uint8_t even if the output in this test is a bool because + // cutlass doesn't support bool as output buffer yet. In cuda + // sizeof(bool) is 1 byte hence it doesn't increase + // memory consumption if we use uint8_t instead of bool. + rmm::device_uvector dist_ref; + rmm::device_uvector dist; raft::handle_t handle; cudaStream_t stream; }; @@ -156,7 +175,7 @@ TEST_P(DistanceAdjTestF, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare(), stream)); + ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare(), stream)); } INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF, ::testing::ValuesIn(inputsf)); @@ -175,7 +194,7 @@ TEST_P(DistanceAdjTestD, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? 
params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare(), stream)); + ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare(), stream)); } INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD, ::testing::ValuesIn(inputsd)); diff --git a/cpp/test/distance/dist_euc_exp.cu b/cpp/test/distance/dist_euc_exp.cu index ff142da7fa..5371b8a3e2 100644 --- a/cpp/test/distance/dist_euc_exp.cu +++ b/cpp/test/distance/dist_euc_exp.cu @@ -25,14 +25,17 @@ class DistanceEucExpTest : public DistanceTest> inputsf = { + {0.001f, 2048, 4096, 128, true, 1234ULL}, {0.001f, 1024, 1024, 32, true, 1234ULL}, {0.001f, 1024, 32, 1024, true, 1234ULL}, {0.001f, 32, 1024, 1024, true, 1234ULL}, {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.003f, 1021, 1021, 1021, true, 1234ULL}, {0.001f, 1024, 1024, 32, false, 1234ULL}, {0.001f, 1024, 32, 1024, false, 1234ULL}, {0.001f, 32, 1024, 1024, false, 1234ULL}, {0.003f, 1024, 1024, 1024, false, 1234ULL}, + {0.003f, 1021, 1021, 1021, false, 1234ULL}, }; typedef DistanceEucExpTest DistanceEucExpTestF; TEST_P(DistanceEucExpTestF, Result) diff --git a/cpp/test/distance/dist_eucsqrt_exp.cu b/cpp/test/distance/dist_eucsqrt_exp.cu new file mode 100644 index 0000000000..c4f2dc80c2 --- /dev/null +++ b/cpp/test/distance/dist_eucsqrt_exp.cu @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceEucSqrtExpTest + : public DistanceTest { +}; + +const std::vector> inputsf = { + {0.001f, 2048, 4096, 128, true, 1234ULL}, + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.003f, 1021, 1021, 1021, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, + {0.003f, 1021, 1021, 1021, false, 1234ULL}, +}; +typedef DistanceEucSqrtExpTest DistanceEucSqrtExpTestF; +TEST_P(DistanceEucSqrtExpTestF, Result) +{ + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance), stream)); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucSqrtExpTestF, ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001, 1024, 1024, 32, false, 1234ULL}, + {0.001, 1024, 32, 1024, false, 1234ULL}, + {0.001, 32, 1024, 1024, false, 1234ULL}, + {0.003, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceEucSqrtExpTest DistanceEucSqrtExpTestD; +TEST_P(DistanceEucSqrtExpTestD, Result) +{ + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? 
params.n : params.m; + ASSERT_TRUE(devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance), stream)); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucSqrtExpTestD, ::testing::ValuesIn(inputsd)); + +class BigMatrixEucSqrtExp + : public BigMatrixDistanceTest { +}; +TEST_F(BigMatrixEucSqrtExp, Result) {} +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 2b1ae5f9ec..800f45c7fc 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -25,7 +25,7 @@ #include #if defined RAFT_DISTANCE_COMPILED -#include +#include #endif namespace raft { diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu index cf7215bddb..168e3d93f8 100644 --- a/cpp/test/distance/gram.cu +++ b/cpp/test/distance/gram.cu @@ -15,7 +15,7 @@ */ #if defined RAFT_DISTANCE_COMPILED -#include +#include #endif #include "../test_utils.h" @@ -91,7 +91,7 @@ class GramMatrixTest : public ::testing::TestWithParam { if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; } if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; } if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; } - // Derive the size of the ouptut from the offset of the last element. + // Derive the size of the output from the offset of the last element. 
size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1; x1.resize(size, stream); size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1; @@ -186,4 +186,4 @@ typedef GramMatrixTest GramMatrixTestDouble; TEST_P(GramMatrixTestFloat, Gram) { runTest(); } INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloat, ::testing::ValuesIn(inputs)); -}; // end namespace raft::distance::kernels \ No newline at end of file +}; // end namespace raft::distance::kernels diff --git a/cpp/test/integer_utils.cpp b/cpp/test/integer_utils.cpp index 117e7f5f7e..46fa8d348d 100644 --- a/cpp/test/integer_utils.cpp +++ b/cpp/test/integer_utils.cpp @@ -16,7 +16,7 @@ #include #include -#include +#include namespace raft { diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index cc2acef565..791537b430 100644 --- a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -70,11 +70,31 @@ class coalescedReductionTest : public ::testing::TestWithParam{}, + raft::Sum{}, + raft::Nop{}); + naiveCoalescedReduction(dots_exp.data(), + data.data(), + cols, + rows, + stream, + T(0), + true, + raft::L2Op{}, + raft::Sum{}, + raft::Nop{}); + coalescedReductionLaunch(handle, dots_act.data(), data.data(), cols, rows); - // Add to result with inplace = true next coalescedReductionLaunch(handle, dots_act.data(), data.data(), cols, rows, true); handle.sync_stream(stream); diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index 5243f2435f..f0b8d3bb55 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -19,22 +19,23 @@ #include #include #include +#include namespace raft { namespace linalg { -template +template struct NormInputs { T tolerance; - int rows, cols; + IdxT rows, cols; NormType type; bool do_sqrt; bool rowMajor; unsigned long long int seed; }; -template -::std::ostream& operator<<(::std::ostream& os, const NormInputs& I) 
+template +::std::ostream& operator<<(::std::ostream& os, const NormInputs& I) { os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " << I.type << ", " << I.do_sqrt << ", " << I.seed << '}' << std::endl; @@ -42,14 +43,14 @@ template } ///// Row-wise norm test definitions -template +template __global__ void naiveRowNormKernel( - Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt) + Type* dots, const Type* data, IdxT D, IdxT N, NormType type, bool do_sqrt) { - Type acc = (Type)0; - int rowStart = threadIdx.x + blockIdx.x * blockDim.x; + Type acc = (Type)0; + IdxT rowStart = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; if (rowStart < N) { - for (int i = 0; i < D; ++i) { + for (IdxT i = 0; i < D; ++i) { if (type == L2Norm) { acc += data[rowStart * D + i] * data[rowStart * D + i]; } else { @@ -60,21 +61,21 @@ __global__ void naiveRowNormKernel( } } -template +template void naiveRowNorm( - Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream) + Type* dots, const Type* data, IdxT D, IdxT N, NormType type, bool do_sqrt, cudaStream_t stream) { - static const int TPB = 64; - int nblks = raft::ceildiv(N, TPB); + static const IdxT TPB = 64; + IdxT nblks = raft::ceildiv(N, TPB); naiveRowNormKernel<<>>(dots, data, D, N, type, do_sqrt); RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template -class RowNormTest : public ::testing::TestWithParam> { +template +class RowNormTest : public ::testing::TestWithParam> { public: RowNormTest() - : params(::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), data(params.rows * params.cols, stream), dots_exp(params.rows, stream), @@ -85,13 +86,13 @@ class RowNormTest : public ::testing::TestWithParam> { void SetUp() override { raft::random::RngState r(params.seed); - int rows = params.rows, cols = params.cols, len = rows * cols; + IdxT rows = params.rows, cols = params.cols, len = rows * cols; 
uniform(handle, r, data.data(), len, T(-1.0), T(1.0)); naiveRowNorm(dots_exp.data(), data.data(), cols, rows, params.type, params.do_sqrt, stream); - auto output_view = raft::make_device_vector_view(dots_act.data(), params.rows); - auto input_row_major = raft::make_device_matrix_view( + auto output_view = raft::make_device_vector_view(dots_act.data(), params.rows); + auto input_row_major = raft::make_device_matrix_view( data.data(), params.rows, params.cols); - auto input_col_major = raft::make_device_matrix_view( + auto input_col_major = raft::make_device_matrix_view( data.data(), params.rows, params.cols); if (params.do_sqrt) { auto fin_op = [] __device__(const T in) { return raft::mySqrt(in); }; @@ -114,20 +115,20 @@ class RowNormTest : public ::testing::TestWithParam> { raft::handle_t handle; cudaStream_t stream; - NormInputs params; + NormInputs params; rmm::device_uvector data, dots_exp, dots_act; }; ///// Column-wise norm test definitisons -template +template __global__ void naiveColNormKernel( - Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt) + Type* dots, const Type* data, IdxT D, IdxT N, NormType type, bool do_sqrt) { - int colID = threadIdx.x + blockIdx.x * blockDim.x; - if (colID > D) return; // avoid out-of-bounds thread + IdxT colID = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; + if (colID >= D) return; // avoid out-of-bounds thread Type acc = 0; - for (int i = 0; i < N; i++) { + for (IdxT i = 0; i < N; i++) { Type v = data[colID + i * D]; acc += type == L2Norm ? v * v : raft::myAbs(v); } @@ -135,21 +136,21 @@ __global__ void naiveColNormKernel( dots[colID] = do_sqrt ? 
raft::mySqrt(acc) : acc; } -template +template void naiveColNorm( - Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream) + Type* dots, const Type* data, IdxT D, IdxT N, NormType type, bool do_sqrt, cudaStream_t stream) { - static const int TPB = 64; - int nblks = raft::ceildiv(D, TPB); + static const IdxT TPB = 64; + IdxT nblks = raft::ceildiv(D, TPB); naiveColNormKernel<<>>(dots, data, D, N, type, do_sqrt); RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template -class ColNormTest : public ::testing::TestWithParam> { +template +class ColNormTest : public ::testing::TestWithParam> { public: ColNormTest() - : params(::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), data(params.rows * params.cols, stream), dots_exp(params.cols, stream), @@ -160,14 +161,14 @@ class ColNormTest : public ::testing::TestWithParam> { void SetUp() override { raft::random::RngState r(params.seed); - int rows = params.rows, cols = params.cols, len = rows * cols; + IdxT rows = params.rows, cols = params.cols, len = rows * cols; uniform(handle, r, data.data(), len, T(-1.0), T(1.0)); naiveColNorm(dots_exp.data(), data.data(), cols, rows, params.type, params.do_sqrt, stream); - auto output_view = raft::make_device_vector_view(dots_act.data(), params.cols); - auto input_row_major = raft::make_device_matrix_view( + auto output_view = raft::make_device_vector_view(dots_act.data(), params.cols); + auto input_row_major = raft::make_device_matrix_view( data.data(), params.rows, params.cols); - auto input_col_major = raft::make_device_matrix_view( + auto input_col_major = raft::make_device_matrix_view( data.data(), params.rows, params.cols); if (params.do_sqrt) { auto fin_op = [] __device__(const T in) { return raft::mySqrt(in); }; @@ -190,121 +191,81 @@ class ColNormTest : public ::testing::TestWithParam> { raft::handle_t handle; cudaStream_t stream; - NormInputs params; + NormInputs params; 
rmm::device_uvector data, dots_exp, dots_act; }; ///// Row- and column-wise tests -const std::vector> inputsf = {{0.00001f, 1024, 32, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL}, - - {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}}; - -const std::vector> inputsd = { - {0.00000001, 1024, 32, L1Norm, false, true, 1234ULL}, - {0.00000001, 1024, 64, L1Norm, false, true, 1234ULL}, - {0.00000001, 1024, 128, L1Norm, false, true, 1234ULL}, - {0.00000001, 1024, 256, L1Norm, false, true, 1234ULL}, - {0.00000001, 1024, 32, L2Norm, false, true, 1234ULL}, - {0.00000001, 1024, 64, L2Norm, false, true, 1234ULL}, - {0.00000001, 1024, 128, L2Norm, false, true, 1234ULL}, - {0.00000001, 1024, 256, L2Norm, false, true, 1234ULL}, - - {0.00000001, 1024, 32, L1Norm, true, true, 1234ULL}, - {0.00000001, 1024, 64, L1Norm, true, true, 1234ULL}, - {0.00000001, 1024, 128, L1Norm, true, true, 1234ULL}, - {0.00000001, 1024, 256, L1Norm, true, true, 1234ULL}, - {0.00000001, 1024, 32, L2Norm, true, true, 1234ULL}, - {0.00000001, 1024, 64, L2Norm, true, true, 1234ULL}, - {0.00000001, 1024, 128, L2Norm, true, true, 1234ULL}, - {0.00000001, 1024, 256, L2Norm, true, true, 1234ULL}}; - -typedef RowNormTest RowNormTestF; -TEST_P(RowNormTestF, Result) -{ - 
ASSERT_TRUE(raft::devArrMatch( - dots_exp.data(), dots_act.data(), params.rows, raft::CompareApprox(params.tolerance))); -} - -typedef RowNormTest RowNormTestD; -TEST_P(RowNormTestD, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - dots_exp.data(), dots_act.data(), params.rows, raft::CompareApprox(params.tolerance))); -} - -INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, ::testing::ValuesIn(inputsf)); - -INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, ::testing::ValuesIn(inputsd)); - -const std::vector> inputscf = { - {0.00001f, 32, 1024, L1Norm, false, true, 1234ULL}, - {0.00001f, 64, 1024, L1Norm, false, true, 1234ULL}, - {0.00001f, 128, 1024, L1Norm, false, true, 1234ULL}, - {0.00001f, 256, 1024, L1Norm, false, true, 1234ULL}, - {0.00001f, 32, 1024, L2Norm, false, true, 1234ULL}, - {0.00001f, 64, 1024, L2Norm, false, true, 1234ULL}, - {0.00001f, 128, 1024, L2Norm, false, true, 1234ULL}, - {0.00001f, 256, 1024, L2Norm, false, true, 1234ULL}, - - {0.00001f, 32, 1024, L1Norm, true, true, 1234ULL}, - {0.00001f, 64, 1024, L1Norm, true, true, 1234ULL}, - {0.00001f, 128, 1024, L1Norm, true, true, 1234ULL}, - {0.00001f, 256, 1024, L1Norm, true, true, 1234ULL}, - {0.00001f, 32, 1024, L2Norm, true, true, 1234ULL}, - {0.00001f, 64, 1024, L2Norm, true, true, 1234ULL}, - {0.00001f, 128, 1024, L2Norm, true, true, 1234ULL}, - {0.00001f, 256, 1024, L2Norm, true, true, 1234ULL}}; - -const std::vector> inputscd = { - {0.00000001, 32, 1024, L1Norm, false, true, 1234ULL}, - {0.00000001, 64, 1024, L1Norm, false, true, 1234ULL}, - {0.00000001, 128, 1024, L1Norm, false, true, 1234ULL}, - {0.00000001, 256, 1024, L1Norm, false, true, 1234ULL}, - {0.00000001, 32, 1024, L2Norm, false, true, 1234ULL}, - {0.00000001, 64, 1024, L2Norm, false, true, 1234ULL}, - {0.00000001, 128, 1024, L2Norm, false, true, 1234ULL}, - {0.00000001, 256, 1024, L2Norm, false, true, 1234ULL}, - - {0.00000001, 32, 1024, L1Norm, true, true, 1234ULL}, - {0.00000001, 64, 1024, L1Norm, true, true, 1234ULL}, - 
{0.00000001, 128, 1024, L1Norm, true, true, 1234ULL}, - {0.00000001, 256, 1024, L1Norm, true, true, 1234ULL}, - {0.00000001, 32, 1024, L2Norm, true, true, 1234ULL}, - {0.00000001, 64, 1024, L2Norm, true, true, 1234ULL}, - {0.00000001, 128, 1024, L2Norm, true, true, 1234ULL}, - {0.00000001, 256, 1024, L2Norm, true, true, 1234ULL}}; - -typedef ColNormTest ColNormTestF; -TEST_P(ColNormTestF, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - dots_exp.data(), dots_act.data(), params.cols, raft::CompareApprox(params.tolerance))); -} - -typedef ColNormTest ColNormTestD; -TEST_P(ColNormTestD, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - dots_exp.data(), dots_act.data(), params.cols, raft::CompareApprox(params.tolerance))); -} - -INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, ::testing::ValuesIn(inputscf)); - -INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, ::testing::ValuesIn(inputscd)); +const std::vector> inputsf_i32 = + raft::util::itertools::product>( + {0.00001f}, {11, 1234}, {7, 33, 128, 500}, {L1Norm, L2Norm}, {false, true}, {true}, {1234ULL}); +const std::vector> inputsd_i32 = + raft::util::itertools::product>({0.00000001}, + {11, 1234}, + {7, 33, 128, 500}, + {L1Norm, L2Norm}, + {false, true}, + {true}, + {1234ULL}); +const std::vector> inputsf_i64 = + raft::util::itertools::product>( + {0.00001f}, {11, 1234}, {7, 33, 128, 500}, {L1Norm, L2Norm}, {false, true}, {true}, {1234ULL}); +const std::vector> inputsd_i64 = + raft::util::itertools::product>({0.00000001}, + {11, 1234}, + {7, 33, 128, 500}, + {L1Norm, L2Norm}, + {false, true}, + {true}, + {1234ULL}); +const std::vector> inputscf_i32 = + raft::util::itertools::product>( + {0.00001f}, {7, 33, 128, 500}, {11, 1234}, {L1Norm, L2Norm}, {false, true}, {true}, {1234ULL}); +const std::vector> inputscd_i32 = + raft::util::itertools::product>({0.00000001}, + {7, 33, 128, 500}, + {11, 1234}, + {L1Norm, L2Norm}, + {false, true}, + {true}, + {1234ULL}); +const std::vector> inputscf_i64 = + 
raft::util::itertools::product>( + {0.00001f}, {7, 33, 128, 500}, {11, 1234}, {L1Norm, L2Norm}, {false, true}, {true}, {1234ULL}); +const std::vector> inputscd_i64 = + raft::util::itertools::product>({0.00000001}, + {7, 33, 128, 500}, + {11, 1234}, + {L1Norm, L2Norm}, + {false, true}, + {true}, + {1234ULL}); + +typedef RowNormTest RowNormTestF_i32; +typedef RowNormTest RowNormTestD_i32; +typedef RowNormTest RowNormTestF_i64; +typedef RowNormTest RowNormTestD_i64; +typedef ColNormTest ColNormTestF_i32; +typedef ColNormTest ColNormTestD_i32; +typedef ColNormTest ColNormTestF_i64; +typedef ColNormTest ColNormTestD_i64; + +#define ROWNORM_TEST(test_type, test_inputs) \ + TEST_P(test_type, Result) \ + { \ + ASSERT_TRUE(raft::devArrMatch( \ + dots_exp.data(), dots_act.data(), dots_exp.size(), raft::CompareApprox(params.tolerance))); \ + } \ + INSTANTIATE_TEST_CASE_P(RowNormTests, test_type, ::testing::ValuesIn(test_inputs)) + +ROWNORM_TEST(RowNormTestF_i32, inputsf_i32); +ROWNORM_TEST(RowNormTestD_i32, inputsd_i32); +ROWNORM_TEST(RowNormTestF_i64, inputsf_i64); +ROWNORM_TEST(RowNormTestD_i64, inputsd_i64); +ROWNORM_TEST(ColNormTestF_i32, inputscf_i32); +ROWNORM_TEST(ColNormTestD_i32, inputscd_i32); +ROWNORM_TEST(ColNormTestF_i64, inputscf_i64); +ROWNORM_TEST(ColNormTestD_i64, inputscd_i64); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/normalize.cu b/cpp/test/linalg/normalize.cu new file mode 100644 index 0000000000..cb949b6a5d --- /dev/null +++ b/cpp/test/linalg/normalize.cu @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace linalg { + +template +struct RowNormalizeInputs { + T tolerance; + IdxT rows, cols; + raft::linalg::NormType norm_type; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const RowNormalizeInputs& I) +{ + os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " << I.norm_type << ", " + << I.seed << '}' << std::endl; + return os; +} + +template +void rowNormalizeRef( + T* out, const T* in, IdxT cols, IdxT rows, raft::linalg::NormType norm_type, cudaStream_t stream) +{ + rmm::device_uvector norm(rows, stream); + if (norm_type == raft::linalg::L2Norm) { + raft::linalg::rowNorm(norm.data(), in, cols, rows, norm_type, true, stream, raft::SqrtOp()); + } else { + raft::linalg::rowNorm(norm.data(), in, cols, rows, norm_type, true, stream, raft::Nop()); + } + raft::linalg::matrixVectorOp( + out, + in, + norm.data(), + cols, + rows, + true, + false, + [] __device__(T a, T b) { return a / b; }, + stream); +} + +template +class RowNormalizeTest : public ::testing::TestWithParam> { + public: + RowNormalizeTest() + : params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + data(params.rows * params.cols, stream), + out_exp(params.rows * params.cols, stream), + out_act(params.rows * params.cols, stream) + { + } + + void SetUp() override + { + raft::random::RngState r(params.seed); + int len = params.rows * params.cols; + uniform(handle, r, 
data.data(), len, T(-10.0), T(10.0)); + + rowNormalizeRef( + out_exp.data(), data.data(), params.cols, params.rows, params.norm_type, stream); + + auto input_view = raft::make_device_matrix_view( + data.data(), params.rows, params.cols); + auto output_view = raft::make_device_matrix_view( + out_act.data(), params.rows, params.cols); + raft::linalg::row_normalize(handle, input_view, output_view, params.norm_type); + + handle.sync_stream(stream); + } + + protected: + raft::handle_t handle; + cudaStream_t stream; + + RowNormalizeInputs params; + rmm::device_uvector data, out_exp, out_act; +}; + +const std::vector> inputsf_i32 = + raft::util::itertools::product>( + {0.00001f}, + {11, 101, 12345}, + {2, 3, 7, 12, 33, 125, 254}, + {raft::linalg::L1Norm, raft::linalg::L2Norm, raft::linalg::LinfNorm}, + {1234ULL}); +const std::vector> inputsd_i32 = + raft::util::itertools::product>( + {0.00000001}, + {11, 101, 12345}, + {2, 3, 7, 12, 33, 125, 254}, + {raft::linalg::L1Norm, raft::linalg::L2Norm, raft::linalg::LinfNorm}, + {1234ULL}); +const std::vector> inputsf_u32 = + raft::util::itertools::product>( + {0.00001f}, + {11u, 101u, 12345u}, + {2u, 3u, 7u, 12u, 33u, 125u, 254u}, + {raft::linalg::L1Norm, raft::linalg::L2Norm, raft::linalg::LinfNorm}, + {1234ULL}); +const std::vector> inputsd_u32 = + raft::util::itertools::product>( + {0.00000001}, + {11u, 101u, 12345u}, + {2u, 3u, 7u, 12u, 33u, 125u, 254u}, + {raft::linalg::L1Norm, raft::linalg::L2Norm, raft::linalg::LinfNorm}, + {1234ULL}); + +#define ROWNORMALIZE_TEST(test_type, test_name, test_inputs) \ + typedef RAFT_DEPAREN(test_type) test_name; \ + TEST_P(test_name, Result) \ + { \ + ASSERT_TRUE(raft::devArrMatch(out_exp.data(), \ + out_act.data(), \ + params.rows* params.cols, \ + raft::CompareApprox(params.tolerance))); \ + } \ + INSTANTIATE_TEST_CASE_P(RowNormalizeTests, test_name, ::testing::ValuesIn(test_inputs)) + +ROWNORMALIZE_TEST((RowNormalizeTest), RowNormalizeTestFI32, inputsf_i32); 
+ROWNORMALIZE_TEST((RowNormalizeTest), RowNormalizeTestDI32, inputsd_i32); +ROWNORMALIZE_TEST((RowNormalizeTest), RowNormalizeTestFU32, inputsf_u32); +ROWNORMALIZE_TEST((RowNormalizeTest), RowNormalizeTestDU32, inputsd_u32); + +} // end namespace linalg +} // end namespace raft diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index 57654f88ab..00f3810d28 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -17,79 +17,97 @@ #include "../test_utils.h" #include "reduce.cuh" #include +#include #include -#include #include #include +#include namespace raft { namespace linalg { -template +template struct ReduceInputs { OutType tolerance; - int rows, cols; + IdxType rows, cols; bool rowMajor, alongRows; + OutType init; unsigned long long int seed; }; -template -::std::ostream& operator<<(::std::ostream& os, const ReduceInputs& dims) +template +::std::ostream& operator<<(::std::ostream& os, const ReduceInputs& dims) { + os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " << dims.rowMajor + << ", " << dims.alongRows << ", " << dims.init << " " << dims.seed << '}'; return os; } // Or else, we get the following compilation error // for an extended __device__ lambda cannot have private or protected access // within its class -template +template void reduceLaunch(OutType* dots, const InType* data, - int cols, - int rows, + IdxType cols, + IdxType rows, bool rowMajor, bool alongRows, + OutType init, bool inplace, - cudaStream_t stream) + cudaStream_t stream, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda final_op) { - Apply apply = alongRows ? Apply::ALONG_ROWS : Apply::ALONG_COLUMNS; - int output_size = alongRows ? cols : rows; + Apply apply = alongRows ? Apply::ALONG_ROWS : Apply::ALONG_COLUMNS; + IdxType output_size = alongRows ? 
cols : rows; - auto output_view_row_major = raft::make_device_vector_view(dots, output_size); - auto input_view_row_major = raft::make_device_matrix_view(data, rows, cols); - - auto output_view_col_major = raft::make_device_vector_view(dots, output_size); + auto output_view = raft::make_device_vector_view(dots, output_size); + auto input_view_row_major = raft::make_device_matrix_view(data, rows, cols); auto input_view_col_major = - raft::make_device_matrix_view(data, rows, cols); + raft::make_device_matrix_view(data, rows, cols); raft::handle_t handle{stream}; if (rowMajor) { reduce(handle, input_view_row_major, - output_view_row_major, - (OutType)0, - + output_view, + init, apply, inplace, - [] __device__(InType in, int i) { return static_cast(in * in); }); + main_op, + reduce_op, + final_op); } else { reduce(handle, input_view_col_major, - output_view_col_major, - (OutType)0, - + output_view, + init, apply, inplace, - [] __device__(InType in, int i) { return static_cast(in * in); }); + main_op, + reduce_op, + final_op); } } -template -class ReduceTest : public ::testing::TestWithParam> { +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::SqrtOp> +class ReduceTest : public ::testing::TestWithParam> { public: ReduceTest() - : params(::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), data(params.rows * params.cols, stream), dots_exp(params.alongRows ? params.rows : params.cols, stream), @@ -101,22 +119,66 @@ class ReduceTest : public ::testing::TestWithParam void SetUp() override { raft::random::RngState r(params.seed); - int rows = params.rows, cols = params.cols; - int len = rows * cols; - outlen = params.alongRows ? 
rows : cols; - uniform(handle, r, data.data(), len, InType(-1.0), InType(1.0)); - naiveReduction( - dots_exp.data(), data.data(), cols, rows, params.rowMajor, params.alongRows, stream); - - // Perform reduction with default inplace = false first - reduceLaunch( - dots_act.data(), data.data(), cols, rows, params.rowMajor, params.alongRows, false, stream); - // Add to result with inplace = true next, which shouldn't affect - // in the case of coalescedReduction! - if (!(params.rowMajor ^ params.alongRows)) { - reduceLaunch( - dots_act.data(), data.data(), cols, rows, params.rowMajor, params.alongRows, true, stream); - } + IdxType rows = params.rows, cols = params.cols; + IdxType len = rows * cols; + gen_uniform(data.data(), r, len, stream); + + MainLambda main_op; + ReduceLambda reduce_op; + FinalLambda fin_op; + + // For both the naive and the actual implementation, execute first with inplace=false then true + + naiveReduction(dots_exp.data(), + data.data(), + cols, + rows, + params.rowMajor, + params.alongRows, + stream, + params.init, + false, + main_op, + reduce_op, + fin_op); + naiveReduction(dots_exp.data(), + data.data(), + cols, + rows, + params.rowMajor, + params.alongRows, + stream, + params.init, + true, + main_op, + reduce_op, + fin_op); + + reduceLaunch(dots_act.data(), + data.data(), + cols, + rows, + params.rowMajor, + params.alongRows, + params.init, + false, + stream, + main_op, + reduce_op, + fin_op); + reduceLaunch(dots_act.data(), + data.data(), + cols, + rows, + params.rowMajor, + params.alongRows, + params.init, + true, + stream, + main_op, + reduce_op, + fin_op); + handle.sync_stream(stream); } @@ -124,92 +186,140 @@ class ReduceTest : public ::testing::TestWithParam raft::handle_t handle; cudaStream_t stream; - ReduceInputs params; + ReduceInputs params; rmm::device_uvector data; rmm::device_uvector dots_exp, dots_act; - int outlen; }; -const std::vector> inputsff = { - {0.000002f, 1024, 32, true, true, 1234ULL}, - {0.000002f, 1024, 64, true, 
true, 1234ULL}, - {0.000002f, 1024, 128, true, true, 1234ULL}, - {0.000002f, 1024, 256, true, true, 1234ULL}, - {0.000002f, 1024, 32, true, false, 1234ULL}, - {0.000002f, 1024, 64, true, false, 1234ULL}, - {0.000002f, 1024, 128, true, false, 1234ULL}, - {0.000002f, 1024, 256, true, false, 1234ULL}, - {0.000002f, 1024, 32, false, true, 1234ULL}, - {0.000002f, 1024, 64, false, true, 1234ULL}, - {0.000002f, 1024, 128, false, true, 1234ULL}, - {0.000002f, 1024, 256, false, true, 1234ULL}, - {0.000002f, 1024, 32, false, false, 1234ULL}, - {0.000002f, 1024, 64, false, false, 1234ULL}, - {0.000002f, 1024, 128, false, false, 1234ULL}, - {0.000002f, 1024, 256, false, false, 1234ULL}}; - -const std::vector> inputsdd = { - {0.000000001, 1024, 32, true, true, 1234ULL}, - {0.000000001, 1024, 64, true, true, 1234ULL}, - {0.000000001, 1024, 128, true, true, 1234ULL}, - {0.000000001, 1024, 256, true, true, 1234ULL}, - {0.000000001, 1024, 32, true, false, 1234ULL}, - {0.000000001, 1024, 64, true, false, 1234ULL}, - {0.000000001, 1024, 128, true, false, 1234ULL}, - {0.000000001, 1024, 256, true, false, 1234ULL}, - {0.000000001, 1024, 32, false, true, 1234ULL}, - {0.000000001, 1024, 64, false, true, 1234ULL}, - {0.000000001, 1024, 128, false, true, 1234ULL}, - {0.000000001, 1024, 256, false, true, 1234ULL}, - {0.000000001, 1024, 32, false, false, 1234ULL}, - {0.000000001, 1024, 64, false, false, 1234ULL}, - {0.000000001, 1024, 128, false, false, 1234ULL}, - {0.000000001, 1024, 256, false, false, 1234ULL}}; - -const std::vector> inputsfd = { - {0.000002f, 1024, 32, true, true, 1234ULL}, - {0.000002f, 1024, 64, true, true, 1234ULL}, - {0.000002f, 1024, 128, true, true, 1234ULL}, - {0.000002f, 1024, 256, true, true, 1234ULL}, - {0.000002f, 1024, 32, true, false, 1234ULL}, - {0.000002f, 1024, 64, true, false, 1234ULL}, - {0.000002f, 1024, 128, true, false, 1234ULL}, - {0.000002f, 1024, 256, true, false, 1234ULL}, - {0.000002f, 1024, 32, false, true, 1234ULL}, - {0.000002f, 1024, 64, 
false, true, 1234ULL}, - {0.000002f, 1024, 128, false, true, 1234ULL}, - {0.000002f, 1024, 256, false, true, 1234ULL}, - {0.000002f, 1024, 32, false, false, 1234ULL}, - {0.000002f, 1024, 64, false, false, 1234ULL}, - {0.000002f, 1024, 128, false, false, 1234ULL}, - {0.000002f, 1024, 256, false, false, 1234ULL}}; - -typedef ReduceTest ReduceTestFF; -TEST_P(ReduceTestFF, Result) -{ - ASSERT_TRUE(devArrMatch( - dots_exp.data(), dots_act.data(), outlen, raft::CompareApprox(params.tolerance))); -} +#define REDUCE_TEST(test_type, test_name, test_inputs) \ + typedef RAFT_DEPAREN(test_type) test_name; \ + TEST_P(test_name, Result) \ + { \ + ASSERT_TRUE(raft::devArrMatch( \ + dots_exp.data(), dots_act.data(), dots_exp.size(), raft::CompareApprox(params.tolerance))); \ + } \ + INSTANTIATE_TEST_CASE_P(ReduceTests, test_name, ::testing::ValuesIn(test_inputs)) -typedef ReduceTest ReduceTestDD; -TEST_P(ReduceTestDD, Result) -{ - ASSERT_TRUE(devArrMatch( - dots_exp.data(), dots_act.data(), outlen, raft::CompareApprox(params.tolerance))); -} +const std::vector> inputsff_i32 = + raft::util::itertools::product>( + {0.000002f}, {11, 1234}, {7, 33, 128, 500}, {true, false}, {true, false}, {0.0f}, {1234ULL}); +const std::vector> inputsdd_i32 = + raft::util::itertools::product>( + {0.000000001}, {11, 1234}, {7, 33, 128, 500}, {true, false}, {true, false}, {0.0}, {1234ULL}); +const std::vector> inputsfd_i32 = + raft::util::itertools::product>( + {0.000000001}, {11, 1234}, {7, 33, 128, 500}, {true, false}, {true, false}, {0.0f}, {1234ULL}); +const std::vector> inputsff_u32 = + raft::util::itertools::product>({0.000002f}, + {11u, 1234u}, + {7u, 33u, 128u, 500u}, + {true, false}, + {true, false}, + {0.0f}, + {1234ULL}); +const std::vector> inputsff_i64 = + raft::util::itertools::product>( + {0.000002f}, {11, 1234}, {7, 33, 128, 500}, {true, false}, {true, false}, {0.0f}, {1234ULL}); -typedef ReduceTest ReduceTestFD; -TEST_P(ReduceTestFD, Result) -{ - ASSERT_TRUE(devArrMatch( - 
dots_exp.data(), dots_act.data(), outlen, raft::CompareApprox(params.tolerance))); -} +REDUCE_TEST((ReduceTest), ReduceTestFFI32, inputsff_i32); +REDUCE_TEST((ReduceTest), ReduceTestDDI32, inputsdd_i32); +REDUCE_TEST((ReduceTest), ReduceTestFDI32, inputsfd_i32); +REDUCE_TEST((ReduceTest), ReduceTestFFU32, inputsff_u32); +REDUCE_TEST((ReduceTest), ReduceTestFFI64, inputsff_i64); + +// The following test cases are for "thick" coalesced reductions + +const std::vector> inputsff_thick_i32 = + raft::util::itertools::product>( + {0.0001f}, {3, 9}, {17771, 33333, 100000}, {true}, {true}, {0.0f}, {1234ULL}); +const std::vector> inputsdd_thick_i32 = + raft::util::itertools::product>( + {0.000001}, {3, 9}, {17771, 33333, 100000}, {true}, {true}, {0.0}, {1234ULL}); +const std::vector> inputsfd_thick_i32 = + raft::util::itertools::product>( + {0.000001}, {3, 9}, {17771, 33333, 100000}, {true}, {true}, {0.0f}, {1234ULL}); +const std::vector> inputsff_thick_u32 = + raft::util::itertools::product>( + {0.0001f}, {3u, 9u}, {17771u, 33333u, 100000u}, {true}, {true}, {0.0f}, {1234ULL}); +const std::vector> inputsff_thick_i64 = + raft::util::itertools::product>( + {0.0001f}, {3, 9}, {17771, 33333, 100000}, {true}, {true}, {0.0f}, {1234ULL}); + +REDUCE_TEST((ReduceTest), ReduceTestFFI32Thick, inputsff_thick_i32); +REDUCE_TEST((ReduceTest), ReduceTestDDI32Thick, inputsdd_thick_i32); +REDUCE_TEST((ReduceTest), ReduceTestFDI32Thick, inputsfd_thick_i32); +REDUCE_TEST((ReduceTest), ReduceTestFFU32Thick, inputsff_thick_u32); +REDUCE_TEST((ReduceTest), ReduceTestFFI64Thick, inputsff_thick_i64); + +// Test key-value-pair reductions. This is important because shuffle intrinsics can't be used +// directly with those types. 
-INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, ::testing::ValuesIn(inputsff)); +template +struct ValueToKVP { + HDI raft::KeyValuePair operator()(T value, IdxT idx) { return {idx, value}; } +}; + +template +struct ArgMaxOp { + HDI raft::KeyValuePair operator()(raft::KeyValuePair a, + raft::KeyValuePair b) + { + return (a.value > b.value || (a.value == b.value && a.key <= b.key)) ? a : b; + } +}; -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, ::testing::ValuesIn(inputsdd)); +const std::vector, int>> inputs_kvpis_i32 = + raft::util::itertools::product, int>>( + {raft::KeyValuePair{0, short(0)}}, + {11, 1234}, + {7, 33, 128, 500}, + {true}, + {true}, + {raft::KeyValuePair{0, short(0)}}, + {1234ULL}); +const std::vector, int>> inputs_kvpif_i32 = + raft::util::itertools::product, int>>( + {raft::KeyValuePair{0, 0.0001f}}, + {11, 1234}, + {7, 33, 128, 500}, + {true}, + {true}, + {raft::KeyValuePair{0, 0.0f}}, + {1234ULL}); +const std::vector, int>> inputs_kvpid_i32 = + raft::util::itertools::product, int>>( + {raft::KeyValuePair{0, 0.000001}}, + {11, 1234}, + {7, 33, 128, 500}, + {true}, + {true}, + {raft::KeyValuePair{0, 0.0}}, + {1234ULL}); -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, ::testing::ValuesIn(inputsfd)); +REDUCE_TEST((ReduceTest, + int, + ValueToKVP, + ArgMaxOp, + raft::Nop, int>>), + ReduceTestKVPISI32, + inputs_kvpis_i32); +REDUCE_TEST((ReduceTest, + int, + ValueToKVP, + ArgMaxOp, + raft::Nop, int>>), + ReduceTestKVPIFI32, + inputs_kvpif_i32); +REDUCE_TEST((ReduceTest, + int, + ValueToKVP, + ArgMaxOp, + raft::Nop, int>>), + ReduceTestKVPIDI32, + inputs_kvpid_i32); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 162bf9f2c1..0dcffd3f41 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -28,70 +28,141 @@ namespace raft { namespace linalg { -template -__global__ void naiveCoalescedReductionKernel(OutType* dots, const InType* data, int D, int 
N) +template +__global__ void naiveCoalescedReductionKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + bool inplace, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda fin_op) { - OutType acc = (OutType)0; - int rowStart = threadIdx.x + blockIdx.x * blockDim.x; + OutType acc = init; + IdxType rowStart = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; if (rowStart < N) { - for (int i = 0; i < D; ++i) { - acc += static_cast(data[rowStart * D + i] * data[rowStart * D + i]); + for (IdxType i = 0; i < D; ++i) { + acc = reduce_op(acc, main_op(data[rowStart * D + i], i)); + } + if (inplace) { + dots[rowStart] = fin_op(reduce_op(dots[rowStart], acc)); + } else { + dots[rowStart] = fin_op(acc); } - dots[rowStart] = 2 * acc; } } -template -void naiveCoalescedReduction(OutType* dots, const InType* data, int D, int N, cudaStream_t stream) +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void naiveCoalescedReduction(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + cudaStream_t stream, + OutType init, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda fin_op = raft::Nop()) { - static const int TPB = 64; - int nblks = raft::ceildiv(N, TPB); - naiveCoalescedReductionKernel<<>>(dots, data, D, N); + static const IdxType TPB = 64; + IdxType nblks = raft::ceildiv(N, TPB); + naiveCoalescedReductionKernel<<>>( + dots, data, D, N, init, inplace, main_op, reduce_op, fin_op); RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template -void unaryAndGemv(OutType* dots, const InType* data, int D, int N, cudaStream_t stream) +template +__global__ void naiveStridedReductionKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + bool inplace, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda fin_op) { - // computes a MLCommon unary op on data (squares it), then computes Ax - //(A input matrix 
and x column vector) to sum columns - rmm::device_uvector sq(D * N, stream); - raft::linalg::unaryOp( - thrust::raw_pointer_cast(sq.data()), - data, - D * N, - [] __device__(InType v) { return static_cast(v * v); }, - stream); - cublasHandle_t handle; - RAFT_CUBLAS_TRY(cublasCreate(&handle)); - rmm::device_uvector ones(N, stream); // column vector [1...1] - raft::linalg::unaryOp( - ones.data(), ones.data(), ones.size(), [=] __device__(OutType input) { return 1; }, stream); - OutType alpha = 1, beta = 0; - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv( - handle, CUBLAS_OP_N, D, N, &alpha, sq.data(), D, ones.data(), 1, &beta, dots, 1, stream)); - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - RAFT_CUBLAS_TRY(cublasDestroy(handle)); + OutType acc = init; + IdxType col = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; + if (col < D) { + for (IdxType i = 0; i < N; ++i) { + acc = reduce_op(acc, main_op(data[i * D + col], i)); + } + if (inplace) { + dots[col] = fin_op(reduce_op(dots[col], acc)); + } else { + dots[col] = fin_op(acc); + } + } +} + +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void naiveStridedReduction(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + cudaStream_t stream, + OutType init, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda fin_op = raft::Nop()) +{ + static const IdxType TPB = 64; + IdxType nblks = raft::ceildiv(D, TPB); + naiveStridedReductionKernel<<>>( + dots, data, D, N, init, inplace, main_op, reduce_op, fin_op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> void naiveReduction(OutType* dots, const InType* data, - int D, - int N, + IdxType D, + IdxType N, bool rowMajor, bool alongRows, - cudaStream_t stream) + cudaStream_t stream, + OutType init, + bool inplace = false, + MainLambda 
main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda fin_op = raft::Nop()) { if (rowMajor && alongRows) { - naiveCoalescedReduction(dots, data, D, N, stream); + naiveCoalescedReduction(dots, data, D, N, stream, init, inplace, main_op, reduce_op, fin_op); } else if (rowMajor && !alongRows) { - unaryAndGemv(dots, data, D, N, stream); + naiveStridedReduction(dots, data, D, N, stream, init, inplace, main_op, reduce_op, fin_op); } else if (!rowMajor && alongRows) { - unaryAndGemv(dots, data, N, D, stream); + naiveStridedReduction(dots, data, N, D, stream, init, inplace, main_op, reduce_op, fin_op); } else { - naiveCoalescedReduction(dots, data, N, D, stream); + naiveCoalescedReduction(dots, data, N, D, stream, init, inplace, main_op, reduce_op, fin_op); } RAFT_CUDA_TRY(cudaDeviceSynchronize()); } diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index 39e2764def..77ca585ea5 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -32,13 +32,13 @@ struct stridedReductionInputs { }; template -void stridedReductionLaunch(T* dots, const T* data, int cols, int rows, cudaStream_t stream) +void stridedReductionLaunch( + T* dots, const T* data, int cols, int rows, bool inplace, cudaStream_t stream) { raft::handle_t handle{stream}; auto dots_view = raft::make_device_vector_view(dots, cols); auto data_view = raft::make_device_matrix_view(data, rows, cols); - strided_reduction( - handle, data_view, dots_view, (T)0, false, [] __device__(T in, int i) { return in * in; }); + strided_reduction(handle, data_view, dots_view, (T)0, inplace, raft::L2Op{}); } template @@ -61,8 +61,30 @@ class stridedReductionTest : public ::testing::TestWithParam{}, + raft::Sum{}, + raft::Nop{}); + naiveStridedReduction(dots_exp.data(), + data.data(), + cols, + rows, + stream, + T(0), + true, + raft::L2Op{}, + raft::Sum{}, + raft::Nop{}); + stridedReductionLaunch(dots_act.data(), data.data(), cols, rows, 
false, stream); + stridedReductionLaunch(dots_act.data(), data.data(), cols, rows, true, stream); handle.sync_stream(stream); } diff --git a/cpp/test/neighbors/ann_ivf_flat.cu b/cpp/test/neighbors/ann_ivf_flat.cu index 9a430e14f2..735d569318 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cu +++ b/cpp/test/neighbors/ann_ivf_flat.cu @@ -20,10 +20,11 @@ #include #include #include +#include #include #include -#include #include +#include #include #include @@ -40,9 +41,7 @@ #include #include -namespace raft { -namespace spatial { -namespace knn { +namespace raft::neighbors::ivf_flat { template struct AnnIvfFlatInputs { @@ -53,6 +52,7 @@ struct AnnIvfFlatInputs { IdxT nprobe; IdxT nlist; raft::distance::DistanceType metric; + bool adaptive_centers; }; template @@ -198,6 +198,45 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { update_host(distances_ivfflat.data(), distances_ivfflat_dev.data(), queries_size, stream_); update_host(indices_ivfflat.data(), indices_ivfflat_dev.data(), queries_size, stream_); handle_.sync_stream(stream_); + + // Test the centroid invariants + if (index_2.adaptive_centers()) { + // The centers must be up-to-date with the corresponding data + std::vector list_sizes(index_2.n_lists()); + std::vector list_offsets(index_2.n_lists()); + rmm::device_uvector centroid(ps.dim, stream_); + raft::copy( + list_sizes.data(), index_2.list_sizes().data_handle(), index_2.n_lists(), stream_); + raft::copy( + list_offsets.data(), index_2.list_offsets().data_handle(), index_2.n_lists(), stream_); + handle_.sync_stream(stream_); + for (uint32_t l = 0; l < index_2.n_lists(); l++) { + rmm::device_uvector cluster_data(list_sizes[l] * ps.dim, stream_); + raft::spatial::knn::detail::utils::copy_selected( + (IdxT)list_sizes[l], + (IdxT)ps.dim, + database.data(), + index_2.indices().data_handle() + list_offsets[l], + (IdxT)ps.dim, + cluster_data.data(), + (IdxT)ps.dim, + stream_); + raft::stats::mean( + centroid.data(), cluster_data.data(), ps.dim, list_sizes[l], 
false, true, stream_); + ASSERT_TRUE(raft::devArrMatch(index_2.centers().data_handle() + ps.dim * l, + centroid.data(), + ps.dim, + raft::CompareApprox(0.001), + stream_)); + } + } else { + // The centers must be immutable + ASSERT_TRUE(raft::devArrMatch(index_2.centers().data_handle(), + index.centers().data_handle(), + index_2.centers().size(), + raft::Compare(), + stream_)); + } } ASSERT_TRUE(eval_neighbours(indices_naive, indices_ivfflat, @@ -243,44 +282,44 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { const std::vector> inputs = { // test various dims (aligned and not aligned to vector sizes) - {1000, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, - {1000, 10000, 2, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, - {1000, 10000, 3, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, - {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, - {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::InnerProduct}, - {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::InnerProduct}, + {1000, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}, + {1000, 10000, 2, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, + {1000, 10000, 3, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}, + {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, + {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false}, + {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true}, // test dims that do not fit into kernel shared memory limits - {1000, 10000, 2048, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, - {1000, 10000, 2049, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, - {1000, 10000, 2050, 16, 40, 1024, raft::distance::DistanceType::InnerProduct}, - {1000, 10000, 2051, 16, 40, 1024, raft::distance::DistanceType::InnerProduct}, - {1000, 10000, 2052, 16, 40, 1024, 
raft::distance::DistanceType::InnerProduct}, - {1000, 10000, 2053, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, - {1000, 10000, 2056, 16, 40, 1024, raft::distance::DistanceType::L2Expanded}, + {1000, 10000, 2048, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, + {1000, 10000, 2049, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, + {1000, 10000, 2050, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false}, + {1000, 10000, 2051, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true}, + {1000, 10000, 2052, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false}, + {1000, 10000, 2053, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}, + {1000, 10000, 2056, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}, // various random combinations - {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::L2Expanded}, - {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::L2Expanded}, - {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::L2Expanded}, - {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::L2Expanded}, - {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded}, - {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded}, - {10000, 131072, 8, 10, 20, 1024, raft::distance::DistanceType::L2Expanded}, - - {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::InnerProduct}, - {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::InnerProduct}, - {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::InnerProduct}, - {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::InnerProduct}, - {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct}, - {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct}, - {10000, 131072, 8, 10, 50, 1024, raft::distance::DistanceType::InnerProduct}, - - {1000, 10000, 4096, 20, 50, 1024, 
raft::distance::DistanceType::InnerProduct}, + {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, + {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::L2Expanded, false}, + {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::L2Expanded, false}, + {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::L2Expanded, false}, + {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true}, + {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true}, + {10000, 131072, 8, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, false}, + + {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::InnerProduct, true}, + {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::InnerProduct, true}, + {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::InnerProduct, false}, + {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::InnerProduct, true}, + {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct, true}, + {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct, false}, + {10000, 131072, 8, 10, 50, 1024, raft::distance::DistanceType::InnerProduct, true}, + + {1000, 10000, 4096, 20, 50, 1024, raft::distance::DistanceType::InnerProduct, false}, // test splitting the big query batches (> max gridDim.y) into smaller batches - {100000, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct}, - {98306, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct}, + {100000, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct, false}, + {98306, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct, true}, // test radix_sort for getting the cluster selection {1000, @@ -289,14 +328,16 @@ const std::vector> inputs = { 10, raft::spatial::knn::detail::topk::kMaxCapacity * 2, raft::spatial::knn::detail::topk::kMaxCapacity * 4, - raft::distance::DistanceType::L2Expanded}, + 
raft::distance::DistanceType::L2Expanded, + false}, {1000, 10000, 16, 10, raft::spatial::knn::detail::topk::kMaxCapacity * 4, raft::spatial::knn::detail::topk::kMaxCapacity * 4, - raft::distance::DistanceType::InnerProduct}}; + raft::distance::DistanceType::InnerProduct, + false}}; typedef AnnIVFFlatTest AnnIVFFlatTestF; TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(); } @@ -313,6 +354,4 @@ TEST_P(AnnIVFFlatTestF_int8, AnnIVFFlat) { this->testIVFFlat(); } INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF_int8, ::testing::ValuesIn(inputs)); -} // namespace knn -} // namespace spatial -} // namespace raft +} // namespace raft::neighbors::ivf_flat diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index a247f0101f..9d6ad11ccb 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -20,10 +20,10 @@ #include #include +#include #include -#include #if defined RAFT_NN_COMPILED -#include +#include #else #pragma message("NN specializations are not enabled; expect very long building times.") #endif @@ -35,6 +35,8 @@ #include +#include +#include #include #include @@ -42,15 +44,15 @@ #include #include -namespace raft::spatial::knn { +namespace raft::neighbors::ivf_pq { struct ivf_pq_inputs { uint32_t num_db_vecs = 4096; uint32_t num_queries = 1024; uint32_t dim = 64; uint32_t k = 32; - raft::spatial::knn::ivf_pq::index_params index_params; - raft::spatial::knn::ivf_pq::search_params search_params; + ivf_pq::index_params index_params; + ivf_pq::search_params search_params; // Set some default parameters for tests ivf_pq_inputs() @@ -102,11 +104,22 @@ inline auto operator<<(std::ostream& os, const ivf_pq_inputs& p) -> std::ostream PRINT_DIFF_V(.search_params.lut_dtype, print_dtype{p.search_params.lut_dtype}); PRINT_DIFF_V(.search_params.internal_distance_dtype, print_dtype{p.search_params.internal_distance_dtype}); - PRINT_DIFF(.search_params.preferred_thread_block_size); os << "}"; return os; } 
+template +auto min_output_size(const handle_t& handle, const ivf_pq::index& index, uint32_t n_probes) + -> IdxT +{ + uint32_t skip = index.n_nonempty_lists() > n_probes ? index.n_nonempty_lists() - n_probes : 0; + auto map_type = [] __device__(uint32_t x) { return IdxT(x); }; + using iter = cub::TransformInputIterator; + iter start(index.list_sizes().data_handle() + skip, map_type); + iter end(index.list_sizes().data_handle() + index.n_nonempty_lists(), map_type); + return thrust::reduce(handle.get_thrust_policy(), start, end); +} + template class ivf_pq_test : public ::testing::TestWithParam { public: @@ -190,7 +203,7 @@ class ivf_pq_test : public ::testing::TestWithParam { } template - auto run(BuildIndex build_index) + void run(BuildIndex build_index) { auto index = build_index(); @@ -229,6 +242,39 @@ class ivf_pq_test : public ::testing::TestWithParam { ps.k, 0.001 / low_precision_factor, min_recall)); + + // Test a few extra invariants + IdxT min_results = min_output_size(handle_, index, ps.search_params.n_probes); + IdxT max_oob = ps.k <= min_results ? 
0 : ps.k - min_results; + IdxT found_oob = 0; + for (uint32_t query_ix = 0; query_ix < ps.num_queries; query_ix++) { + for (uint32_t k = 0; k < ps.k; k++) { + auto flat_i = query_ix * ps.k + k; + auto found_ix = indices_ivf_pq[flat_i]; + if (found_ix == ivf_pq::index::kOutOfBoundsRecord) { + found_oob++; + continue; + } + ASSERT_NE(found_ix, ivf_pq::index::kInvalidRecord) + << "got an invalid record at query_ix = " << query_ix << ", k = " << k + << " (distance = " << distances_ivf_pq[flat_i] << ")"; + ASSERT_LT(found_ix, ps.num_db_vecs) + << "got an impossible index = " << found_ix << " at query_ix = " << query_ix + << ", k = " << k << " (distance = " << distances_ivf_pq[flat_i] << ")"; + } + } + ASSERT_LE(found_oob, max_oob) + << "got too many records out-of-bounds (see ivf_pq::index::kOutOfBoundsRecord)."; + if (found_oob > 0) { + RAFT_LOG_WARN( + "Got %zu results out-of-bounds because of large top-k (%zu) and small n_probes (%u) and " + "small DB size/n_lists ratio (%zu / %u)", + size_t(found_oob), + size_t(ps.k), + ps.search_params.n_probes, + size_t(ps.num_db_vecs), + ps.index_params.n_lists); + } } void SetUp() override // NOLINT @@ -365,10 +411,6 @@ inline auto enum_variety() -> test_cases_t ADD_CASE({ x.search_params.internal_distance_dtype = CUDA_R_32F; }); ADD_CASE({ x.search_params.internal_distance_dtype = CUDA_R_16F; }); - ADD_CASE({ x.search_params.preferred_thread_block_size = 256; }); - ADD_CASE({ x.search_params.preferred_thread_block_size = 512; }); - ADD_CASE({ x.search_params.preferred_thread_block_size = 1024; }); - return xs; } @@ -464,6 +506,30 @@ inline auto special_cases() -> test_cases_t x.search_params.n_probes = 50; }); + ADD_CASE({ + x.num_db_vecs = 10000; + x.dim = 16; + x.num_queries = 500; + x.k = 128; + x.index_params.metric = distance::DistanceType::L2Expanded; + x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_SUBSPACE; + x.index_params.pq_bits = 8; + x.index_params.n_lists = 100; + x.search_params.n_probes = 100; + }); + 
+ ADD_CASE({ + x.num_db_vecs = 10000; + x.dim = 16; + x.num_queries = 500; + x.k = 129; + x.index_params.metric = distance::DistanceType::L2Expanded; + x.index_params.codebook_kind = ivf_pq::codebook_gen::PER_SUBSPACE; + x.index_params.pq_bits = 8; + x.index_params.n_lists = 100; + x.search_params.n_probes = 100; + }); + return xs; } @@ -484,4 +550,4 @@ inline auto special_cases() -> test_cases_t #define INSTANTIATE(type, vals) \ INSTANTIATE_TEST_SUITE_P(IvfPq, type, ::testing::ValuesIn(vals)); /* NOLINT */ -} // namespace raft::spatial::knn +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu index 30203fd2f0..ecb2faa6a0 100644 --- a/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu @@ -16,7 +16,7 @@ #include "../ann_ivf_pq.cuh" -namespace raft::spatial::knn { +namespace raft::neighbors::ivf_pq { using f32_f32_i64 = ivf_pq_test; @@ -24,4 +24,4 @@ TEST_BUILD_SEARCH(f32_f32_i64) TEST_BUILD_EXTEND_SEARCH(f32_f32_i64) INSTANTIATE(f32_f32_i64, enum_variety_l2() + enum_variety_ip() + big_dims_small_lut()); -} // namespace raft::spatial::knn +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu index cf2cf1ac54..57d87f47f6 100644 --- a/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu @@ -16,11 +16,11 @@ #include "../ann_ivf_pq.cuh" -namespace raft::spatial::knn { +namespace raft::neighbors::ivf_pq { using f32_f32_u32 = ivf_pq_test; TEST_BUILD_SEARCH(f32_f32_u32) INSTANTIATE(f32_f32_u32, defaults() + var_n_probes() + var_k() + special_cases()); -} // namespace raft::spatial::knn +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/test/neighbors/ann_ivf_pq/test_float_uint64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_uint64_t.cu index 
5321783a32..2c203d12e7 100644 --- a/cpp/test/neighbors/ann_ivf_pq/test_float_uint64_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/test_float_uint64_t.cu @@ -16,11 +16,11 @@ #include "../ann_ivf_pq.cuh" -namespace raft::spatial::knn { +namespace raft::neighbors::ivf_pq { using f32_f32_u64 = ivf_pq_test; TEST_BUILD_EXTEND_SEARCH(f32_f32_u64) INSTANTIATE(f32_f32_u64, defaults() + small_dims() + big_dims_moderate_lut()); -} // namespace raft::spatial::knn +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/test/neighbors/ann_ivf_pq/test_int8_t_uint64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_int8_t_uint64_t.cu index e6cafb0ab4..c1029d590c 100644 --- a/cpp/test/neighbors/ann_ivf_pq/test_int8_t_uint64_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/test_int8_t_uint64_t.cu @@ -16,11 +16,11 @@ #include "../ann_ivf_pq.cuh" -namespace raft::spatial::knn { +namespace raft::neighbors::ivf_pq { using f32_i08_u64 = ivf_pq_test; TEST_BUILD_SEARCH(f32_i08_u64) INSTANTIATE(f32_i08_u64, defaults() + big_dims() + var_k()); -} // namespace raft::spatial::knn +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/test/neighbors/ann_ivf_pq/test_uint8_t_uint64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_uint8_t_uint64_t.cu index 23a4c87e14..729e99d22c 100644 --- a/cpp/test/neighbors/ann_ivf_pq/test_uint8_t_uint64_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/test_uint8_t_uint64_t.cu @@ -16,7 +16,7 @@ #include "../ann_ivf_pq.cuh" -namespace raft::spatial::knn { +namespace raft::neighbors::ivf_pq { using f32_u08_u64 = ivf_pq_test; @@ -24,4 +24,4 @@ TEST_BUILD_SEARCH(f32_u08_u64) TEST_BUILD_EXTEND_SEARCH(f32_u08_u64) INSTANTIATE(f32_u08_u64, small_dims_per_cluster() + enum_variety()); -} // namespace raft::spatial::knn +} // namespace raft::neighbors::ivf_pq diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh index faf6fad115..07ef410d36 100644 --- a/cpp/test/neighbors/ann_utils.cuh +++ b/cpp/test/neighbors/ann_utils.cuh @@ -25,7 +25,10 @@ #include #include -namespace 
raft::spatial::knn { +#include "../test_utils.h" +#include + +namespace raft::neighbors { struct print_dtype { cudaDataType_t value; @@ -169,16 +172,17 @@ void naiveBfKnn(EvalT* dist_topk, naive_distance_kernel<<>>( dist.data(), x + offset * dim, y, batch_size, input_len, dim, type); - detail::select_topk(dist.data(), - nullptr, - batch_size, - input_len, - static_cast(k), - dist_topk + offset * k, - indices_topk + offset * k, - type != raft::distance::DistanceType::InnerProduct, - stream, - mr); + spatial::knn::detail::select_topk( + dist.data(), + nullptr, + batch_size, + input_len, + static_cast(k), + dist_topk + offset * k, + indices_topk + offset * k, + type != raft::distance::DistanceType::InnerProduct, + stream, + mr); } RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } @@ -244,4 +248,4 @@ auto eval_neighbours(const std::vector& expected_idx, return testing::AssertionSuccess(); } -} // namespace raft::spatial::knn +} // namespace raft::neighbors diff --git a/cpp/test/neighbors/refine.cu b/cpp/test/neighbors/refine.cu new file mode 100644 index 0000000000..e1700e44b3 --- /dev/null +++ b/cpp/test/neighbors/refine.cu @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.h" +#include "ann_utils.cuh" + +#include "refine_helper.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#if defined RAFT_NN_COMPILED +#include +#endif + +#include + +namespace raft::neighbors { + +template +class RefineTest : public ::testing::TestWithParam> { + public: + RefineTest() + : stream_(handle_.get_stream()), + data(handle_, ::testing::TestWithParam>::GetParam()) + { + } + + protected: + public: // tamas remove + void testRefine() + { + std::vector indices(data.p.n_queries * data.p.k); + std::vector distances(data.p.n_queries * data.p.k); + + if (data.p.host_data) { + raft::neighbors::refine(handle_, + data.dataset_host.view(), + data.queries_host.view(), + data.candidates_host.view(), + data.refined_indices_host.view(), + data.refined_distances_host.view(), + data.p.metric); + raft::copy(indices.data(), + data.refined_indices_host.data_handle(), + data.refined_indices_host.size(), + stream_); + raft::copy(distances.data(), + data.refined_distances_host.data_handle(), + data.refined_distances_host.size(), + stream_); + + } else { + raft::neighbors::refine(handle_, + data.dataset.view(), + data.queries.view(), + data.candidates.view(), + data.refined_indices.view(), + data.refined_distances.view(), + data.p.metric); + update_host(distances.data(), + data.refined_distances.data_handle(), + data.refined_distances.size(), + stream_); + update_host( + indices.data(), data.refined_indices.data_handle(), data.refined_indices.size(), stream_); + } + handle_.sync_stream(stream_); + + double min_recall = 1; + + ASSERT_TRUE(raft::neighbors::eval_neighbours(data.true_refined_indices_host, + indices, + data.true_refined_distances_host, + distances, + data.p.n_queries, + data.p.k, + 0.001, + min_recall)); + } + + public: + raft::handle_t handle_; + rmm::cuda_stream_view stream_; + detail::RefineHelper data; +}; + +const std::vector> inputs = + raft::util::itertools::product>( + 
{137}, + {1000}, + {16}, + {1, 10, 33}, + {33}, + {raft::distance::DistanceType::L2Expanded, raft::distance::DistanceType::InnerProduct}, + {false, true}); + +typedef RefineTest RefineTestF; +TEST_P(RefineTestF, AnnRefine) { this->testRefine(); } + +INSTANTIATE_TEST_CASE_P(RefineTest, RefineTestF, ::testing::ValuesIn(inputs)); + +typedef RefineTest RefineTestF_uint8; +TEST_P(RefineTestF_uint8, AnnRefine) { this->testRefine(); } +INSTANTIATE_TEST_CASE_P(RefineTest, RefineTestF_uint8, ::testing::ValuesIn(inputs)); + +} // namespace raft::neighbors diff --git a/cpp/test/neighbors/refine_helper.cuh b/cpp/test/neighbors/refine_helper.cuh new file mode 100644 index 0000000000..3c69a8f5b7 --- /dev/null +++ b/cpp/test/neighbors/refine_helper.cuh @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "ann_utils.cuh" +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace raft::neighbors::detail { + +template +struct RefineInputs { + IdxT n_queries; + IdxT n_rows; + IdxT dim; + IdxT k; // after refinement + IdxT k0; // initial k before refinement (k0 >= k). + raft::distance::DistanceType metric; + bool host_data; +}; + +/** Helper class to allocate arrays and generate input data for refinement test and benchmark. 
 */ +template +class RefineHelper { + public: + RefineHelper(const raft::handle_t& handle, RefineInputs params) + : handle_(handle), stream_(handle.get_stream()), p(params) + { + raft::random::Rng r(1234ULL); + + dataset = raft::make_device_matrix(handle_, p.n_rows, p.dim); + queries = raft::make_device_matrix(handle_, p.n_queries, p.dim); + if constexpr (std::is_same{}) { + r.uniform(dataset.data_handle(), dataset.size(), DataT(-10.0), DataT(10.0), stream_); + r.uniform(queries.data_handle(), queries.size(), DataT(-10.0), DataT(10.0), stream_); + } else { + r.uniformInt(dataset.data_handle(), dataset.size(), DataT(1), DataT(20), stream_); + r.uniformInt(queries.data_handle(), queries.size(), DataT(1), DataT(20), stream_); + } + + refined_distances = raft::make_device_matrix(handle_, p.n_queries, p.k); + refined_indices = raft::make_device_matrix(handle_, p.n_queries, p.k); + + // Generate candidate vectors + { + candidates = raft::make_device_matrix(handle_, p.n_queries, p.k0); + rmm::device_uvector distances_tmp(p.n_queries * p.k0, stream_); + raft::neighbors::naiveBfKnn(distances_tmp.data(), + candidates.data_handle(), + queries.data_handle(), + dataset.data_handle(), + p.n_queries, + p.n_rows, + p.dim, + p.k0, + p.metric, + stream_); + handle_.sync_stream(stream_); + } + + if (p.host_data) { + dataset_host = raft::make_host_matrix(p.n_rows, p.dim); + queries_host = raft::make_host_matrix(p.n_queries, p.dim); + candidates_host = raft::make_host_matrix(p.n_queries, p.k0); + + raft::copy(dataset_host.data_handle(), dataset.data_handle(), dataset.size(), stream_); + raft::copy(queries_host.data_handle(), queries.data_handle(), queries.size(), stream_); + raft::copy( + candidates_host.data_handle(), candidates.data_handle(), candidates.size(), stream_); + + refined_distances_host = raft::make_host_matrix(p.n_queries, p.k); + refined_indices_host = raft::make_host_matrix(p.n_queries, p.k); + handle_.sync_stream(stream_); + } + + // Generate ground truth for testing. 
+ { + rmm::device_uvector distances_dev(p.n_queries * p.k, stream_); + rmm::device_uvector indices_dev(p.n_queries * p.k, stream_); + raft::neighbors::naiveBfKnn(distances_dev.data(), + indices_dev.data(), + queries.data_handle(), + dataset.data_handle(), + p.n_queries, + p.n_rows, + p.dim, + p.k, + p.metric, + stream_); + true_refined_distances_host.resize(p.n_queries * p.k); + true_refined_indices_host.resize(p.n_queries * p.k); + raft::copy(true_refined_indices_host.data(), indices_dev.data(), indices_dev.size(), stream_); + raft::copy( + true_refined_distances_host.data(), distances_dev.data(), distances_dev.size(), stream_); + handle_.sync_stream(stream_); + } + } + + public: + RefineInputs p; + const raft::handle_t& handle_; + rmm::cuda_stream_view stream_; + + raft::device_matrix dataset; + raft::device_matrix queries; + raft::device_matrix candidates; // Neighbor candidate indices + raft::device_matrix refined_indices; + raft::device_matrix refined_distances; + + raft::host_matrix dataset_host; + raft::host_matrix queries_host; + raft::host_matrix candidates_host; + raft::host_matrix refined_indices_host; + raft::host_matrix refined_distances_host; + + std::vector true_refined_indices_host; + std::vector true_refined_distances_host; +}; +} // namespace raft::neighbors::detail \ No newline at end of file diff --git a/cpp/test/pow2_utils.cu b/cpp/test/pow2_utils.cu index c76064ade7..9e9bd80673 100644 --- a/cpp/test/pow2_utils.cu +++ b/cpp/test/pow2_utils.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,7 +15,7 @@ */ #include -#include +#include namespace raft { diff --git a/cpp/test/random/multi_variable_gaussian.cu b/cpp/test/random/multi_variable_gaussian.cu index caf982d4ed..51a79ae04a 100644 --- a/cpp/test/random/multi_variable_gaussian.cu +++ b/cpp/test/random/multi_variable_gaussian.cu @@ -138,7 +138,7 @@ class MVGTest : public ::testing::TestWithParam> { raft::update_device(P_d.data(), P.data(), dim * dim, stream); raft::update_device(x_d.data(), x.data(), dim, stream); - // initilizing the mvg + // initializing the mvg mvg = new multi_variable_gaussian(handle, dim, method); std::size_t o = mvg->get_workspace_size(); @@ -212,11 +212,11 @@ class MVGMdspanTest : public ::testing::TestWithParam> { static auto old_enum_to_new_enum(typename multi_variable_gaussian::Decomposer method) { if (method == multi_variable_gaussian::chol_decomp) { - return detail::multi_variable_gaussian_decomposition_method::CHOLESKY; + return multi_variable_gaussian_decomposition_method::CHOLESKY; } else if (method == multi_variable_gaussian::jacobi) { - return detail::multi_variable_gaussian_decomposition_method::JACOBI; + return multi_variable_gaussian_decomposition_method::JACOBI; } else { - return detail::multi_variable_gaussian_decomposition_method::QR; + return multi_variable_gaussian_decomposition_method::QR; } } diff --git a/cpp/test/span.cu b/cpp/test/span.cu index a6aed0896c..e9af9b857f 100644 --- a/cpp/test/span.cu +++ b/cpp/test/span.cu @@ -131,7 +131,7 @@ struct TestEqual { TEST(GPUSpan, WithTrust) { - // Not adviced to initialize span with host_vector, since h_vec.data() is + // Not advised to initialize span with host_vector, since h_vec.data() is // a host function. 
thrust::host_vector h_vec(16); std::iota(h_vec.begin(), h_vec.end(), 0); diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu index bea8f903cd..108d38a8b4 100644 --- a/cpp/test/sparse/csr_transpose.cu +++ b/cpp/test/sparse/csr_transpose.cu @@ -29,7 +29,6 @@ namespace raft { namespace sparse { using namespace raft; -using namespace raft::sparse; template struct CSRTransposeInputs { diff --git a/cpp/test/sparse/spgemmi.cu b/cpp/test/sparse/spgemmi.cu new file mode 100644 index 0000000000..a132c94fde --- /dev/null +++ b/cpp/test/sparse/spgemmi.cu @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "../test_utils.h" + +#include +#include +#include +#include +#include + +#include +#include + +namespace raft { +namespace sparse { + +struct SPGemmiInputs { + int n_rows, n_cols; +}; + +template +class SPGemmiTest : public ::testing::TestWithParam { + public: + SPGemmiTest() + : params(::testing::TestWithParam::GetParam()), stream(handle.get_stream()) + { + } + + protected: + void SetUp() override {} + + void Run() + { + // Host problem definition + float alpha = 1.0f; + float beta = 0.0f; + int A_num_rows = 5; + int A_num_cols = 3; + // int B_num_rows = A_num_cols; + int B_num_cols = 4; + int B_nnz = 9; + int lda = A_num_rows; + int ldc = A_num_rows; + int A_size = lda * A_num_cols; + int C_size = ldc * B_num_cols; + int hB_cscOffsets[] = {0, 3, 4, 7, 9}; + int hB_rows[] = {0, 2, 3, 1, 0, 2, 3, 1, 3}; + float hB_values[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; + float hA[] = {1.0f, + 2.0f, + 3.0f, + 4.0f, + 5.0f, + 6.0f, + 7.0f, + 8.0f, + 9.0f, + 10.0f, + 11.0f, + 12.0f, + 13.0f, + 14.0f, + 15.0f}; + std::vector hC(C_size); + std::vector hC_expected{23, 26, 29, 32, 35, 24, 28, 32, 36, 40, + 71, 82, 93, 104, 115, 48, 56, 64, 72, 80}; + //-------------------------------------------------------------------------- + // Device memory management + rmm::device_uvector dB_cscOffsets(B_num_cols + 1, stream); + rmm::device_uvector dB_rows(B_nnz, stream); + rmm::device_uvector dB_values(B_nnz, stream); + rmm::device_uvector dA(A_size, stream); + rmm::device_uvector dC(C_size, stream); + rmm::device_uvector dCT(C_size, stream); + + raft::update_device(dB_cscOffsets.data(), hB_cscOffsets, B_num_cols + 1, stream); + raft::update_device(dB_rows.data(), hB_rows, B_nnz, stream); + raft::update_device(dB_values.data(), hB_values, B_nnz, stream); + raft::update_device(dA.data(), hA, A_size, stream); + raft::update_device(dC.data(), hC.data(), C_size, stream); + + 
//-------------------------------------------------------------------------- + // execute gemmi + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsegemmi(handle.get_cusparse_handle(), + A_num_rows, + B_num_cols, + A_num_cols, + B_nnz, + &alpha, + dA.data(), + lda, + dB_values.data(), + dB_cscOffsets.data(), + dB_rows.data(), + &beta, + dC.data(), + ldc, + handle.get_stream())); + + //-------------------------------------------------------------------------- + // result check + raft::update_host(hC.data(), dC.data(), C_size, stream); + ASSERT_TRUE(hostVecMatch(hC_expected, hC, raft::Compare())); + } + + protected: + raft::handle_t handle; + cudaStream_t stream; + + SPGemmiInputs params; +}; + +using SPGemmiTestF = SPGemmiTest; +TEST_P(SPGemmiTestF, Result) { Run(); } + +using SPGemmiTestD = SPGemmiTest; +TEST_P(SPGemmiTestD, Result) { Run(); } + +const std::vector csc_inputs_f = {{5, 4}}; +const std::vector csc_inputs_d = {{5, 4}}; + +INSTANTIATE_TEST_CASE_P(SparseGemmi, SPGemmiTestF, ::testing::ValuesIn(csc_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseGemmi, SPGemmiTestD, ::testing::ValuesIn(csc_inputs_d)); + +} // namespace sparse +} // namespace raft diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h index 14319b85e1..26483e6b2d 100644 --- a/cpp/test/test_utils.h +++ b/cpp/test/test_utils.h @@ -18,13 +18,18 @@ #include #include #include +#include +#include #include #include +#include +#include #include #include #include #include +#include #include #include @@ -42,7 +47,7 @@ struct CompareApprox { { T diff = abs(a - b); T m = std::max(abs(a), abs(b)); - T ratio = diff >= eps ? diff / m : diff; + T ratio = diff > eps ? 
diff / m : diff; return (ratio <= eps); } @@ -51,6 +56,30 @@ struct CompareApprox { T eps; }; +template +::std::ostream& operator<<(::std::ostream& os, const raft::KeyValuePair& kv) +{ + os << "{ " << kv.key << ", " << kv.value << '}'; + return os; +} + +template +struct CompareApprox> { + CompareApprox(raft::KeyValuePair eps) + : compare_keys(eps.key), compare_values(eps.value) + { + } + bool operator()(const raft::KeyValuePair& a, + const raft::KeyValuePair& b) const + { + return compare_keys(a.key, b.key) && compare_values(a.value, b.value); + } + + private: + CompareApprox compare_keys; + CompareApprox compare_values; +}; + template struct CompareApproxAbs { CompareApproxAbs(T eps_) : eps(eps_) {} @@ -280,6 +309,52 @@ testing::AssertionResult match(const T expected, T actual, L eq_compare) return testing::AssertionSuccess(); } +template +typename std::enable_if_t> gen_uniform(T* out, + raft::random::RngState& rng, + IdxT len, + cudaStream_t stream, + T range_min = T(-1), + T range_max = T(1)) +{ + raft::random::uniform(rng, out, len, range_min, range_max, stream); +} + +template +typename std::enable_if_t> gen_uniform(T* out, + raft::random::RngState& rng, + IdxT len, + cudaStream_t stream, + T range_min = T(0), + T range_max = T(100)) +{ + raft::random::uniformInt(rng, out, len, range_min, range_max, stream); +} + +template +void gen_uniform(raft::KeyValuePair* out, + raft::random::RngState& rng, + IdxT len, + cudaStream_t stream) +{ + rmm::device_uvector keys(len, stream); + rmm::device_uvector values(len, stream); + + gen_uniform(keys.data(), rng, len, stream); + gen_uniform(values.data(), rng, len, stream); + + const T1* d_keys = keys.data(); + const T2* d_values = values.data(); + auto counting = thrust::make_counting_iterator(0); + thrust::for_each(rmm::exec_policy(stream), + counting, + counting + len, + [out, d_keys, d_values] __device__(int idx) { + out[idx].key = d_keys[idx]; + out[idx].value = d_values[idx]; + }); +} + /** @} */ /** time the function 
call 'func' using cuda events */ diff --git a/docs/source/_static/copybutton.css b/docs/source/_static/copybutton.css deleted file mode 100644 index 5eef6e366d..0000000000 --- a/docs/source/_static/copybutton.css +++ /dev/null @@ -1,42 +0,0 @@ -/* This contains code with copyright by the scikit-learn project, subject to -the license in /thirdparty/LICENSES/LICENSE.scikit_learn */ - -/* copybutton */ -/* Adds "Show/Hide Output" button to Examples */ - -.copybutton { - cursor: pointer; - position: absolute; - top: 0px; - right: 0px; - border: 1px solid rgb(221, 221, 221); - color: rgb(221, 221, 221); - font-family: monospace; - padding-left: 0.2rem; - padding-right: 0.2rem; -} - -div.highlight:hover span.copybutton::after { - background: #3F556B; - border-radius: 0.25rem; - color: white; - content: attr(title); - padding: 0.25rem; - position: absolute; - z-index: 98; - width: 100px; - font-size: 0.7rem; - top: 0; - right: 0; -} - -/* copy buttonn */ -div.highlight:hover span.copybutton { - background-color: #3F556B; - color: white; -} - -div.highlight:hover span.copybutton:hover { - background-color: #20252B; -} - diff --git a/docs/source/_static/example_mod.js b/docs/source/_static/example_mod.js deleted file mode 100644 index 77dc618a82..0000000000 --- a/docs/source/_static/example_mod.js +++ /dev/null @@ -1,61 +0,0 @@ -// This contains code with copyright by the scikit-learn project, subject to -// the license in /thirdparty/LICENSES/LICENSE.scikit_learn - -$(document).ready(function () { - /* Add a [>>>] button on the top-right corner of code samples to hide - * the >>> and ... prompts and the output and thus make the code - * copyable. 
*/ - var div = $('.highlight-python .highlight,' + - '.highlight-python3 .highlight,' + - '.highlight-pycon .highlight,' + - '.highlight-default .highlight') - var pre = div.find('pre'); - - // get the styles from the current theme - pre.parent().parent().css('position', 'relative'); - var hide_text = 'Hide prompts and outputs'; - var show_text = 'Show prompts and outputs'; - - // create and add the button to all the code blocks that contain >>> - div.each(function (index) { - var jthis = $(this); - if (jthis.find('.gp').length > 0) { - var button = $('>>>'); - button.attr('title', hide_text); - button.data('hidden', 'false'); - jthis.prepend(button); - } - // tracebacks (.gt) contain bare text elements that need to be - // wrapped in a span to work with .nextUntil() (see later) - jthis.find('pre:has(.gt)').contents().filter(function () { - return ((this.nodeType == 3) && (this.data.trim().length > 0)); - }).wrap(''); - }); - - // define the behavior of the button when it's clicked - $('.copybutton').click(function (e) { - e.preventDefault(); - var button = $(this); - if (button.data('hidden') === 'false') { - // hide the code output - button.parent().find('.go, .gp, .gt').hide(); - button.next('pre') - .find('.gt') - .nextUntil('.gp, .go') - .css('visibility', 'hidden'); - button.css('text-decoration', 'line-through'); - button.attr('title', show_text); - button.data('hidden', 'true'); - } else { - // show the code output - button.parent().find('.go, .gp, .gt').show(); - button.next('pre') - .find('.gt') - .nextUntil('.gp, .go') - .css('visibility', 'visible'); - button.css('text-decoration', 'none'); - button.attr('title', hide_text); - button.data('hidden', 'false'); - } - }); -}); \ No newline at end of file diff --git a/docs/source/_static/infoboxes.css b/docs/source/_static/infoboxes.css deleted file mode 100644 index 4cc597bd28..0000000000 --- a/docs/source/_static/infoboxes.css +++ /dev/null @@ -1,87 +0,0 @@ -/* This contains code with copyright by the 
scikit-learn project, subject to -the license in /thirdparty/LICENSES/LICENSE.scikit_learn */ - -/* info boxes */ - -div.topic { - padding: 0.5rem; - background-color: #eee; - margin-bottom: 1rem; - border-radius: 0.25rem; - border: 1px solid #CCC; -} - -div.topic p { - margin-bottom: 0.25rem; -} - -div.topic dd { - margin-bottom: 0.25rem; -} - -p.topic-title { - font-weight: bold; - margin-bottom: 0.5rem; -} - -div.topic > ul.simple { - margin-bottom: 0.25rem; -} - -p.admonition-title { - margin-right: 0.5rem; - font-weight: bold; - display: inline; -} - -p.admonition-title:after { - content: ":"; -} - -div.admonition p.admonition-title + p, div.deprecated p { - display: inline; -} - -div.admonition, div.deprecated { - padding: 0.5rem; - border-radius: 0.5rem; - border: 1px solid #ddd; - margin-bottom: 1rem; -} - -div.admonition { - background-color: #eee; -} - -div.admonition p, div.admonition dl, div.admonition dd { - margin-bottom: 0 -} - -div.deprecated { - color: #b94a48; - background-color: #F3E5E5; - border: 1px solid #eed3d7; -} - -div.seealso { - background-color: #FFFBE8; - border: 1px solid #fbeed5; - color: #AF8A4B; -} - -div.versionchanged { - margin-top: 0.5rem; - padding: 0.5rem; - background-color: #FFFBE8; - border: 1px solid #fbeed5; - border-radius: 0.5rem; -} - -div.versionchanged p { - margin-bottom: 0; -} - -dt.label { - float: left; - padding-right: 0.5rem; -} \ No newline at end of file diff --git a/docs/source/build.md b/docs/source/build.md index 2a093fcc22..4e692f85c5 100644 --- a/docs/source/build.md +++ b/docs/source/build.md @@ -19,15 +19,16 @@ In addition to the libraries included with cudatoolkit 11.0+, there are some oth #### Optional - [cuCollections](https://github.com/NVIDIA/cuCollections) - Used in `raft::sparse::distance` API. -- [Libcu++](https://github.com/NVIDIA/libcudacxx) v1.7.0 -- [FAISS](https://github.com/facebookresearch/faiss) v1.7.0 - Used in `raft::neighbors` API.. 
+- [Libcu++](https://github.com/NVIDIA/libcudacxx) v1.7.0 - Used by cuCollections +- [CUTLASS](https://github.com/NVIDIA/cutlass) v2.9.1 - Used in `raft::distance` API. +- [FAISS](https://github.com/facebookresearch/faiss) v1.7.0 - Used in `raft::neighbors` API. - [NCCL](https://github.com/NVIDIA/nccl) - Used in `raft::comms` API and needed to build `raft-dask`. - [UCX](https://github.com/openucx/ucx) - Used in `raft::comms` API and needed to build `raft-dask`. - [Googletest](https://github.com/google/googletest) - Needed to build tests - [Googlebench](https://github.com/google/benchmark) - Needed to build benchmarks - [Doxygen](https://github.com/doxygen/doxygen) - Needed to build docs -All of RAFT's C++ APIs can be used header-only but pre-compiled shared libraries also contain some host-accessable APIs and template instantiations to accelerate compile times. +All of RAFT's C++ APIs can be used header-only but pre-compiled shared libraries also contain some host-accessible APIs and template instantiations to accelerate compile times. The recommended way to build and install RAFT is to use the `build.sh` script in the root of the repository. This script can build both the C++ and Python artifacts and provides options for building and installing the headers, tests, benchmarks, and individual shared libraries. @@ -35,9 +36,14 @@ The recommended way to build and install RAFT is to use the `build.sh` script in `build.sh` uses [rapids-cmake](https://github.com/rapidsai/rapids-cmake), which will automatically download any dependencies which are not already installed. It's important to note that while all the headers will be installed and available, some parts of the RAFT API depend on libraries like `FAISS`, which will need to be explicitly enabled in `build.sh`. -The following example will download the needed dependencies and install the RAFT headers into `$INSTALL_PREFIX/include/raft`. 
The `--install` flag can be omitted to just have the build download the needed dependencies. Since RAFT is primarily used at build-time, the dependencies will never be installed by the RAFT build, with the exception of building FAISS statically into the shared libraries. +The following example will download the needed dependencies and install the RAFT headers into `$INSTALL_PREFIX/include/raft`. ```bash -./build.sh libraft --install +./build.sh libraft + +``` +The `-n` flag can be passed to just have the build download the needed dependencies. Since RAFT is primarily used at build-time, the dependencies will never be installed by the RAFT build, with the exception of building FAISS statically into the shared libraries. +```bash +./build.sh libraft -n ``` ### C++ Shared Libraries (optional) @@ -52,7 +58,7 @@ Individual shared libraries have their own flags and multiple can be used (thoug ./build.sh libraft --compile-nn --compile-dist ``` -Add the `--install` flag to the above example to also install the shared libraries into `$INSTALL_PREFIX/lib`. +In above example the shared libraries are installed by default into `$INSTALL_PREFIX/lib`. To disable this, pass `-n` flag. 
### ccache and sccache @@ -147,9 +153,9 @@ The Python APIs can be built and installed using the `build.sh` script: ```bash # to build pylibraft -./build.sh libraft pylibraft --install --compile-libs +./build.sh libraft pylibraft --compile-libs # to build raft-dask -./build.sh libraft raft-dask --install --compile-libs +./build.sh libraft raft-dask --compile-libs ``` `setup.py` can also be used to build the Python APIs manually: @@ -180,7 +186,7 @@ The documentation requires that the C++ headers and python packages have been bu The following will build the docs along with the C++ and Python packages: ``` -./build.sh libraft pylibraft raft-dask docs --compile-libs --install +./build.sh libraft pylibraft raft-dask docs --compile-libs ``` @@ -231,7 +237,7 @@ If RAFT has already been installed, such as by using the `build.sh` script, use Use `find_package(raft COMPONENTS nn distance)` to enable the shared libraries and transitively pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available for configuring linking paths in addition to `raft::raft`. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). -The pre-compiled libraries contain template specializations for commonly used types, such as single- and double-precision floating-point. In order to use the symbols in the pre-compiled libraries, the compiler needs to be told not to instantiate templates that are already contained in the shared libraries. By convention, these header files are named `specializations.hpp` and located in the base directory for the packages that contain specializations. +The pre-compiled libraries contain template specializations for commonly used types, such as single- and double-precision floating-point. 
In order to use the symbols in the pre-compiled libraries, the compiler needs to be told not to instantiate templates that are already contained in the shared libraries. By convention, these header files are named `specializations.cuh` and located in the base directory for the packages that contain specializations. The following example tells the compiler to ignore the pre-compiled templates for the `libraft-distance` API so any symbols already compiled into pre-compiled shared library will be used instead: ```c++ diff --git a/docs/source/conf.py b/docs/source/conf.py index a27254433e..4f78ae2145 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -51,6 +51,7 @@ "breathe", "recommonmark", "sphinx_markdown_tables", + "sphinx_copybutton" ] breathe_default_project = "RAFT" @@ -112,32 +113,27 @@ # a list of builtin themes. # -html_theme = "sphinx_rtd_theme" +html_theme = "pydata_sphinx_theme" -# on_rtd is whether we are on readthedocs.org -on_rtd = os.environ.get("READTHEDOCS", None) == "True" - -if not on_rtd: - # only import and set the theme if we're building docs locally - # otherwise, readthedocs.org uses their theme by default, - # so no need to specify it - import sphinx_rtd_theme - - html_theme = "sphinx_rtd_theme" - html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # -# html_theme_options = {} +html_theme_options = { + "external_links": [], + "github_url": "https://github.com/rapidsai/raft", + "twitter_url": "https://twitter.com/rapidsai", + "show_toc_level": 1, + "navbar_align": "right", +} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
html_static_path = ["_static"] -html_js_files = ["example_mod.js"] +html_js_files = [] # -- Options for HTMLHelp output ------------------------------------------ @@ -203,8 +199,6 @@ def setup(app): - app.add_css_file("copybutton.css") - app.add_css_file("infoboxes.css") app.add_css_file("references.css") app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") app.add_js_file( diff --git a/CONTRIBUTING.md b/docs/source/contributing.md similarity index 72% rename from CONTRIBUTING.md rename to docs/source/contributing.md index faf777ba42..1b4071d0a5 100755 --- a/CONTRIBUTING.md +++ b/docs/source/contributing.md @@ -1,4 +1,4 @@ -# Contributing to RAFT +# Contributing If you are interested in contributing to RAFT, your contributions will fall into three categories: @@ -37,6 +37,43 @@ into three categories: Remember, if you are unsure about anything, don't hesitate to comment on issues and ask for clarifications! + +### Python / Pre-commit hooks + +RAFT uses [pre-commit](https://pre-commit.com/) to execute code linters and formatters such as +[Black](https://black.readthedocs.io/en/stable/), [isort](https://pycqa.github.io/isort/), and +[flake8](https://flake8.pycqa.org/en/latest/). These tools ensure a consistent code format +throughout the project. Using pre-commit ensures that linter versions and options are aligned for +all developers. Additionally, there is a CI check in place to enforce that committed code follows +our standards. + +To use `pre-commit`, install via `conda` or `pip`: + +```bash +conda install -c conda-forge pre-commit +``` + +```bash +pip install pre-commit +``` + +Then run pre-commit hooks before committing code: + +```bash +pre-commit run +``` + +Optionally, you may set up the pre-commit hooks to run automatically when you make a git commit. This can be done by running: + +```bash +pre-commit install +``` + +Now code linters and formatters will be run each time you commit changes. 
+ +You can skip these checks with `git commit --no-verify` or with the short version `git commit -n`. + + ### Seasoned developers Once you have gotten your feet wet and are more comfortable with the code, you diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst index cf3829422d..569fd64061 100644 --- a/docs/source/cpp_api.rst +++ b/docs/source/cpp_api.rst @@ -2,8 +2,6 @@ C++ API Reference ~~~~~~~~~~~~~~~~~ - - .. _api: .. toctree:: diff --git a/docs/source/cpp_api/cluster.rst b/docs/source/cpp_api/cluster.rst index 90c430ace9..6fdc1c696f 100644 --- a/docs/source/cpp_api/cluster.rst +++ b/docs/source/cpp_api/cluster.rst @@ -4,25 +4,38 @@ Cluster This page provides C++ class references for the publicly-exposed elements of the `raft/cluster` headers. RAFT provides fundamental clustering algorithms which are, themselves, considered reusable building blocks for other algorithms. +.. role:: py(code) + :language: c++ + :class: highlight + K-Means -------- +####### + +Header: `raft/cluster/kmeans.cuh` .. doxygennamespace:: raft::cluster::kmeans :project: RAFT :members: + :content-only: Hierarchical Clustering ------------------------ +####################### + +Header: `raft/cluster/single_linkage.cuh` .. doxygennamespace:: raft::cluster::hierarchy :project: RAFT :members: + :content-only: Spectral Clustering -------------------- +################### + +Header: `raft/spectral/partition.cuh` .. doxygennamespace:: raft::spectral :project: RAFT :members: + :content-only: diff --git a/docs/source/cpp_api/core.rst b/docs/source/cpp_api/core.rst index 9e4ef412f7..68965053de 100644 --- a/docs/source/cpp_api/core.rst +++ b/docs/source/cpp_api/core.rst @@ -6,9 +6,16 @@ require minimal dependencies, can be compiled without `nvcc`, and thus are safe the headers in the `raft/core` include directory, any headers in the codebase with the suffix `_types.hpp` are also safe to expose in public APIs. +.. 
role:: py(code) + :language: c++ + :class: highlight + + handle_t ######## +Header: `raft/core/handle.hpp` + .. doxygenclass:: raft::handle_t :project: RAFT :members: @@ -17,6 +24,8 @@ handle_t Interruptible ############# +Header: `raft/core/interruptible.hpp` + .. doxygenclass:: raft::interruptible :project: RAFT :members: @@ -24,14 +33,19 @@ Interruptible NVTX #### +Header: `raft/core/nvtx.hpp` + .. doxygennamespace:: raft::common::nvtx :project: RAFT :members: + :content-only: Key-Value Pair ############## +Header: `raft/core/kvp.hpp` + .. doxygenstruct:: raft::KeyValuePair :project: RAFT :members: @@ -40,6 +54,8 @@ Key-Value Pair logger ###### +Header: `raft/core/logger.hpp` + .. doxygenclass:: raft::logger :project: RAFT :members: @@ -48,7 +64,10 @@ logger Multi-node Multi-GPU #################### +Header: `raft/core/comms.hpp` + .. doxygennamespace:: raft::comms :project: RAFT :members: + :content-only: diff --git a/docs/source/cpp_api/distance.rst b/docs/source/cpp_api/distance.rst index 2596361f6a..e77e311cdc 100644 --- a/docs/source/cpp_api/distance.rst +++ b/docs/source/cpp_api/distance.rst @@ -4,8 +4,16 @@ Distance This page provides C++ class references for the publicly-exposed elements of the `raft/distance` package. RAFT's distances have been highly optimized and support a wide assortment of different distance measures. +.. role:: py(code) + :language: c++ + :class: highlight + + Distance ######## +Header: `raft/distance/distance.cuh` + .. doxygennamespace:: raft::distance :project: RAFT + :members: diff --git a/docs/source/cpp_api/linalg.rst b/docs/source/cpp_api/linalg.rst index 5664e5b3dc..081eb40298 100644 --- a/docs/source/cpp_api/linalg.rst +++ b/docs/source/cpp_api/linalg.rst @@ -6,6 +6,12 @@ In addition to providing highly optimized arithmetic and matrix/vector operation by providing common BLAS routines, standard linear system solvers, factorization and eigenvalue solvers.
Some of these routines hide the complexities of lower-level C-based libraries provided in the CUDA toolkit +.. role:: py(code) + :language: c++ + :class: highlight + + .. doxygennamespace:: raft::linalg :project: RAFT :members: + :content-only: diff --git a/docs/source/cpp_api/matrix.rst b/docs/source/cpp_api/matrix.rst index 945658eb7b..b032281a1c 100644 --- a/docs/source/cpp_api/matrix.rst +++ b/docs/source/cpp_api/matrix.rst @@ -4,6 +4,12 @@ Matrix This page provides C++ class references for the publicly-exposed elements of the `raft/matrix` headers. The `raft/matrix` headers cover many operations on matrices that are otherwise not covered by `raft/linalg`. +.. role:: py(code) + :language: c++ + :class: highlight + + .. doxygennamespace:: raft::matrix :project: RAFT :members: + :content-only: diff --git a/docs/source/cpp_api/mdspan.rst b/docs/source/cpp_api/mdspan.rst index a283da967b..511ead8573 100644 --- a/docs/source/cpp_api/mdspan.rst +++ b/docs/source/cpp_api/mdspan.rst @@ -1,23 +1,19 @@ -Multi-dimensional Span / Array -============================== +Multi-dimensional Data +====================== This page provides C++ class references for the RAFT's 1d span and multi-dimension owning (mdarray) and non-owning (mdspan) APIs. These headers can be found in the `raft/core` directory. -Representation -############## +.. role:: py(code) + :language: c++ + :class: highlight -.. doxygenstruct:: raft::host_device_accessor - :project: RAFT - :members: -.. doxygentypedef:: raft::host_accessor - :project: RAFT +Representation +############## -.. doxygentypedef:: raft::device_accessor - :project: RAFT -.. doxygentypedef:: raft::managed_accessor - :project: RAFT +Layouts +------- .. doxygentypedef:: raft::row_major :project: RAFT @@ -25,6 +21,10 @@ Representation .. doxygentypedef:: raft::col_major :project: RAFT + +Shapes +------ + .. doxygentypedef:: raft::matrix_extent :project: RAFT @@ -43,20 +43,38 @@ Representation .. 
doxygentypedef:: raft::extent_5d :project: RAFT -.. doxygentypedef:: raft::dynamic_extent +.. doxygenfunction:: raft::flatten(mdspan_type mds) + :project: RAFT + +.. doxygenfunction:: raft:: flatten(const array_interface_type& mda) + :project: RAFT + +.. doxygenfunction:: raft::reshape(mdspan_type mds, extents new_shape) + :project: RAFT + +.. doxygenfunction:: raft::reshape(const array_interface_type& mda, extents new_shape) :project: RAFT -.. doxygentypedef:: raft::extents + +Accessors +--------- + +.. doxygenstruct:: raft::host_device_accessor :project: RAFT + :members: -.. doxygenfunction:: raft::flatten +.. doxygentypedef:: raft::host_accessor :project: RAFT +.. doxygentypedef:: raft::device_accessor + :project: RAFT -.. doxygenfunction:: raft::reshape +.. doxygentypedef:: raft::managed_accessor :project: RAFT + + mdarray ####### @@ -81,7 +99,6 @@ Device Vocabulary .. doxygentypedef:: raft::device_mdarray :project: RAFT - .. doxygentypedef:: raft::device_matrix :project: RAFT @@ -136,42 +153,15 @@ mdspan .. doxygentypedef:: raft::mdspan :project: RAFT -.. doxygenstruct:: raft::is_mdspan - :project: RAFT - :members: - -.. doxygentypedef:: raft::is_mdspan_t - :project: RAFT - -.. doxygenstruct:: raft::is_input_mdspan - :project: RAFT - :members: - -.. doxygentypedef:: raft::is_input_mdspan_t - :project: RAFT - -.. doxygenstruct:: raft::is_output_mdspan - :project: RAFT - :members: - -.. doxygentypedef:: raft::is_output_mdspan_t - :project: RAFT - -.. doxygentypedef:: raft::enable_if_mdspan - :project: RAFT - -.. doxygentypedef:: raft::enable_if_input_mdspan - :project: RAFT - -.. doxygentypedef:: raft::enable_if_output_mdspan - :project: RAFT - .. doxygenfunction:: raft::make_mdspan :project: RAFT .. doxygenfunction:: raft::make_extents :project: RAFT +.. doxygenfunction:: raft::make_strided_layout(Extents extents, Strides strides) + :project: RAFT + .. doxygenfunction:: raft::unravel_index :project: RAFT @@ -185,13 +175,13 @@ Device Vocabulary .. 
doxygenstruct:: raft::is_device_mdspan :project: RAFT -.. doxygenstruct:: raft::is_device_mdspan_t +.. doxygentypedef:: raft::is_device_mdspan_t :project: RAFT -.. doxygenstruct:: raft::is_input_device_mdspan_t +.. doxygentypedef:: raft::is_input_device_mdspan_t :project: RAFT -.. doxygenstruct:: raft::is_output_device_mdspan_t +.. doxygentypedef:: raft::is_output_device_mdspan_t :project: RAFT .. doxygentypedef:: raft::enable_if_device_mdspan @@ -216,13 +206,10 @@ Device Vocabulary Device Factories ---------------- -.. doxygenfunction:: raft::make_device_mdspan - :project: RAFT - .. doxygenfunction:: raft::make_device_matrix_view :project: RAFT -.. doxygenfunction:: raft::make_device_vector_view +.. doxygenfunction:: raft::make_device_vector_view(ElementType* ptr, IndexType n) :project: RAFT .. doxygenfunction:: raft::make_device_scalar_view @@ -238,13 +225,13 @@ Managed Vocabulary .. doxygenstruct:: raft::is_managed_mdspan :project: RAFT -.. doxygenstruct:: raft::is_managed_mdspan_t +.. doxygentypedef:: raft::is_managed_mdspan_t :project: RAFT -.. doxygenstruct:: raft::is_input_managed_mdspan_t +.. doxygentypedef:: raft::is_input_managed_mdspan_t :project: RAFT -.. doxygenstruct:: raft::is_output_managed_mdspan_t +.. doxygentypedef:: raft::is_output_managed_mdspan_t :project: RAFT .. doxygentypedef:: raft::enable_if_managed_mdspan @@ -256,30 +243,11 @@ Managed Vocabulary .. doxygentypedef:: raft::enable_if_output_managed_mdspan :project: RAFT -.. doxygentypedef:: raft::managed_matrix_view - :project: RAFT - -.. doxygentypedef:: raft::managed_vector_view - :project: RAFT - -.. doxygentypedef:: raft::managed_scalar_view - :project: RAFT - Managed Factories ----------------- -.. doxygenfunction:: raft::make_managed_mdspan - :project: RAFT - -.. doxygenfunction:: raft::make_managed_matrix_view - :project: RAFT - -.. doxygenfunction:: raft::make_managed_vector_view - :project: RAFT - -.. doxygenfunction:: raft::make_managed_scalar_view - :project: RAFT +.. 
doxygenfunction:: make_managed_mdspan(ElementType* ptr, extents exts) Host Vocabulary @@ -291,13 +259,13 @@ Host Vocabulary .. doxygenstruct:: raft::is_host_mdspan :project: RAFT -.. doxygenstruct:: raft::is_host_mdspan_t +.. doxygentypedef:: raft::is_host_mdspan_t :project: RAFT -.. doxygenstruct:: raft::is_input_host_mdspan_t +.. doxygentypedef:: raft::is_input_host_mdspan_t :project: RAFT -.. doxygenstruct:: raft::is_output_host_mdspan_t +.. doxygentypedef:: raft::is_output_host_mdspan_t :project: RAFT .. doxygentypedef:: raft::enable_if_host_mdspan @@ -330,6 +298,40 @@ Host Factories .. doxygenfunction:: raft::make_device_scalar_view :project: RAFT + +Validation Routines +------------------- + +.. doxygenstruct:: raft::is_mdspan + :project: RAFT + :members: + +.. doxygentypedef:: raft::is_mdspan_t + :project: RAFT + +.. doxygenstruct:: raft::is_input_mdspan + :project: RAFT + :members: + +.. doxygentypedef:: raft::is_input_mdspan_t + :project: RAFT + +.. doxygenstruct:: raft::is_output_mdspan + :project: RAFT + :members: + +.. doxygentypedef:: raft::is_output_mdspan_t + :project: RAFT + +.. doxygentypedef:: raft::enable_if_mdspan + :project: RAFT + +.. doxygentypedef:: raft::enable_if_input_mdspan + :project: RAFT + +.. doxygentypedef:: raft::enable_if_output_mdspan + :project: RAFT + span #### diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst index 962bbd1efe..93eecf68b4 100644 --- a/docs/source/cpp_api/neighbors.rst +++ b/docs/source/cpp_api/neighbors.rst @@ -3,41 +3,61 @@ Neighbors This page provides C++ class references for the publicly-exposed elements of the neighbors package. +.. role:: py(code) + :language: c++ + :class: highlight + Brute-force ----------- +Header: `raft/neighbors/brute_force.cuh` + .. doxygennamespace:: raft::neighbors::brute_force :project: RAFT + :members: + :content-only: IVF-Flat -------- +Header: `raft/neighbors/ivf_flat.cuh` + .. 
doxygennamespace:: raft::neighbors::ivf_flat :project: RAFT :members: + :content-only: IVF-PQ -------- +Header: `raft/neighbors/ivf_pq.cuh` + .. doxygennamespace:: raft::neighbors::ivf_pq :project: RAFT :members: + :content-only: Epsilon Neighborhood -------------------- +Header: `raft/neighbors/epsilon_neighborhood.cuh` + .. doxygennamespace:: raft::neighbors::epsilon_neighborhood :project: RAFT :members: + :content-only: Random Ball Cover ----------------- +Header: `raft/neighbors/ball_cover.cuh` + .. doxygennamespace:: raft::neighbors::ball_cover :project: RAFT :members: + :content-only: diff --git a/docs/source/cpp_api/random.rst b/docs/source/cpp_api/random.rst index be2c188617..353f783ed4 100644 --- a/docs/source/cpp_api/random.rst +++ b/docs/source/cpp_api/random.rst @@ -3,30 +3,109 @@ Random This page provides C++ class references for the publicly-exposed elements of the random package. +.. role:: py(code) + :language: c++ + :class: highlight + +Header: `raft/random/rng_state.hpp` + +.. doxygenstruct:: raft::random::RngState + :project: RAFT + :members: + + Data Generation ############### -.. doxygenfunction:: raft::random::make_blobs +make_blobs +---------- + +Header: `raft/random/make_blobs.cuh` + +.. doxygenfunction:: raft::random::make_blobs(raft::handle_t const& handle, raft::device_matrix_view out, raft::device_vector_view labels, IdxT n_clusters, std::optional> centers, std::optional> const cluster_std, const DataT cluster_std_scalar, bool shuffle, DataT center_box_min, DataT center_box_max, uint64_t seed, GeneratorType type) + :project: RAFT + +make_regression +--------------- + +Header: `raft/random/make_regression.cuh` + +.. 
doxygenfunction:: raft::random::make_regression(const raft::handle_t& handle, raft::device_matrix_view out, raft::device_matrix_view values, IdxT n_informative, std::optional> coef, DataT bias, IdxT effective_rank, DataT tail_strength, DataT noise, bool shuffle, uint64_t seed, GeneratorType type) + :project: RAFT + +rmat +---- + +Header: `raft/random/rmat_rectangular_generator.cuh` + +.. doxygenfunction:: raft::random::rmat_rectangular_gen(const raft::handle_t& handle, raft::random::RngState& r, raft::device_vector_view theta, raft::device_mdspan, raft::row_major> out, raft::device_vector_view out_src, raft::device_vector_view out_dst, IdxT r_scale, IdxT c_scale) + :project: RAFT + + +Random Sampling +############### + +Distributions +------------- + +Header: `raft/random/rng.cuh` + +.. doxygenfunction:: raft::random::uniform(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view out, OutputValueType start, OutputValueType end) :project: RAFT -.. doxygenfunction:: raft::random::make_regression +.. doxygenfunction:: raft::random::uniformInt(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view out, OutputValueType start, OutputValueType end) :project: RAFT -.. doxygenfunction:: raft::random::rmat_rectangular_gen +.. doxygenfunction:: raft::random::normal(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view out, OutputValueType mu, OutputValueType sigma) :project: RAFT +.. doxygenfunction:: raft::random::normalInt(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view out, OutputValueType mu, OutputValueType sigma) + :project: RAFT + +.. doxygenfunction:: raft::random::normalTable(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view mu_vec, std::variant, OutputValueType> sigma, raft::device_matrix_view out) + :project: RAFT + +.. 
doxygenfunction:: raft::random::fill(const raft::handle_t& handle, RngState& rng_state, OutputValueType val, raft::device_vector_view out) + :project: RAFT + +.. doxygenfunction:: raft::random::bernoulli(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view out, Type prob) + :project: RAFT + +.. doxygenfunction:: raft::random::scaled_bernoulli(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view out, OutputValueType prob, OutputValueType scale) + :project: RAFT + +.. doxygenfunction:: raft::random::gumbel(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view out, OutputValueType mu, OutputValueType beta) + :project: RAFT + +.. doxygenfunction:: raft::random::lognormal(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view out, OutputValueType mu, OutputValueType sigma) + :project: RAFT + +.. doxygenfunction:: raft::random::logistic(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view out, OutputValueType mu, OutputValueType scale) + :project: RAFT + +.. doxygenfunction:: raft::random::exponential(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view out, OutputValueType lambda) + :project: RAFT + +.. doxygenfunction:: raft::random::rayleigh(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view out, OutputValueType sigma) + :project: RAFT + +.. doxygenfunction:: raft::random::laplace(const raft::handle_t& handle, RngState& rng_state, raft::device_vector_view out, OutputValueType mu, OutputValueType scale) + :project: RAFT + + +Sampling Without Replacement +---------------------------- -Random Number Generation -######################## +Header: `raft/random/rng.cuh` -.. doxygenclass:: raft::random::Rng +.. doxygengroup:: sample_without_replacement :project: RAFT :members: + :content-only: -Useful Operations -################# +Header: `raft/random/permute.cuh` -.. doxygenfunction:: raft::random::permute +.. 
doxygenfunction:: raft::random::permute(const raft::handle_t& handle, raft::device_matrix_view in, std::optional> permsOut, std::optional> out) :project: RAFT diff --git a/docs/source/cpp_api/solver.rst b/docs/source/cpp_api/solver.rst index f7ca244dc8..d03f3bb1eb 100644 --- a/docs/source/cpp_api/solver.rst +++ b/docs/source/cpp_api/solver.rst @@ -3,10 +3,16 @@ Solvers This page provides C++ class references for the publicly-exposed elements of the iterative and combinatorial solvers package. +.. role:: py(code) + :language: c++ + :class: highlight + Linear Assignment Problem ######################### +Header: `raft/solver/linear_assignment.cuh` + .. doxygenclass:: raft::solver::LinearAssignmentProblem :project: RAFT :members: @@ -14,5 +20,7 @@ Linear Assignment Problem Minimum Spanning Tree ##################### +Header: `raft/sparse/solver/mst.cuh` + .. doxygenfunction:: raft::sparse::solver::mst :project: RAFT diff --git a/docs/source/cpp_api/sparse.rst b/docs/source/cpp_api/sparse.rst index a7c32cc65d..ea29dbebd0 100644 --- a/docs/source/cpp_api/sparse.rst +++ b/docs/source/cpp_api/sparse.rst @@ -3,6 +3,10 @@ Sparse This page provides C++ class references for the publicly-exposed elements of the sparse package. +.. role:: py(code) + :language: c++ + :class: highlight + Conversion ########## @@ -10,6 +14,7 @@ Conversion .. doxygennamespace:: raft::sparse::convert :project: RAFT :members: + :content-only: Distance ######## @@ -17,6 +22,7 @@ Distance .. doxygennamespace:: raft::sparse::distance :project: RAFT :members: + :content-only: Linear Algebra ############## @@ -24,6 +30,7 @@ Linear Algebra .. doxygennamespace:: raft::sparse::linalg :project: RAFT :members: + :content-only: Matrix Operations ################# @@ -31,10 +38,12 @@ Matrix Operations .. doxygennamespace:: raft::sparse::op :project: RAFT :members: + :content-only: -Nearest Neighbors -################# +Neighbors +######### .. 
doxygennamespace:: raft::sparse::neighbors :project: RAFT :members: + :content-only: diff --git a/docs/source/cpp_api/stats.rst b/docs/source/cpp_api/stats.rst index 8ad8b8a604..f795b9e84c 100644 --- a/docs/source/cpp_api/stats.rst +++ b/docs/source/cpp_api/stats.rst @@ -3,6 +3,12 @@ Stats This page provides C++ class references for the publicly-exposed elements of the stats package. +.. role:: py(code) + :language: c++ + :class: highlight + + .. doxygennamespace:: raft::stats :project: RAFT :members: + :content-only: diff --git a/DEVELOPER_GUIDE.md b/docs/source/developer_guide.md similarity index 96% rename from DEVELOPER_GUIDE.md rename to docs/source/developer_guide.md index e1dd682fd9..b37d5dc1af 100644 --- a/DEVELOPER_GUIDE.md +++ b/docs/source/developer_guide.md @@ -2,7 +2,7 @@ ## Local Development -Devloping features and fixing bugs for the RAFT library itself is straightforward and only requires building and installing the relevant RAFT artifacts. +Developing features and fixing bugs for the RAFT library itself is straightforward and only requires building and installing the relevant RAFT artifacts. The process for working on a CUDA/C++ feature which might span RAFT and one or more consuming libraries can vary slightly depending on whether the consuming project relies on a source build (as outlined in the [BUILD](BUILD.md#install_header_only_cpp) docs). In such a case, the option `CPM_raft_SOURCE=/path/to/raft/source` can be passed to the cmake of the consuming project in order to build the local RAFT from source. The PR with relevant changes to the consuming project can also pin the RAFT version temporarily by explicitly changing the `FORK` and `PINNED_TAG` arguments to the RAFT branch containing their changes when invoking `find_and_configure_raft`. The pin should be reverted after the changed is merged to the RAFT project and before it is merged to the dependent project(s) downstream. 
@@ -31,4 +31,4 @@ It's important for RAFT to maintain a high test coverage in order to minimize th ## Documentation -Public APIs always require documentation, since those will be exposed directly to users. In addition to summarizing the purpose of each class / function in the public API, the arguments (and relevant templates) should be documented along with brief usage examples. \ No newline at end of file +Public APIs always require documentation, since those will be exposed directly to users. In addition to summarizing the purpose of each class / function in the public API, the arguments (and relevant templates) should be documented along with brief usage examples. diff --git a/docs/source/index.rst b/docs/source/index.rst index c46f08aac6..e66152b904 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -41,9 +41,11 @@ While not exhaustive, the following general categories help summarize the accele quick_start.md build.md + developer_guide.md cpp_api.rst pylibraft_api.rst raft_dask_api.rst + contributing.md Indices and tables diff --git a/docs/source/pylibraft_api.rst b/docs/source/pylibraft_api.rst index 4df0d9d01c..5c44c5f419 100644 --- a/docs/source/pylibraft_api.rst +++ b/docs/source/pylibraft_api.rst @@ -2,12 +2,13 @@ PyLibRAFT API Reference ~~~~~~~~~~~~~~~~~~~~~~~ -.. role:: py(code) - :language: python - :class: highlight +.. _api: +.. toctree:: + :maxdepth: 4 -Pairwise Distances -================== - -.. 
autofunction:: pylibraft.distance.pairwise_distance \ No newline at end of file + pylibraft_api/common.rst + pylibraft_api/cluster.rst + pylibraft_api/distance.rst + pylibraft_api/neighbors.rst + pylibraft_api/random.rst \ No newline at end of file diff --git a/docs/source/pylibraft_api/cluster.rst b/docs/source/pylibraft_api/cluster.rst new file mode 100644 index 0000000000..1a5aabf48a --- /dev/null +++ b/docs/source/pylibraft_api/cluster.rst @@ -0,0 +1,12 @@ +Cluster +======= + +This page provides pylibraft class references for the publicly-exposed elements of the `pylibraft.cluster` package. + +.. role:: py(code) + :language: python + :class: highlight + +.. autofunction:: pylibraft.cluster.compute_new_centroids + + diff --git a/docs/source/pylibraft_api/common.rst b/docs/source/pylibraft_api/common.rst new file mode 100644 index 0000000000..4070243b22 --- /dev/null +++ b/docs/source/pylibraft_api/common.rst @@ -0,0 +1,36 @@ +Common +====== + +This page provides `pylibraft` class references for the publicly-exposed elements of the `pylibraft.common` package. + + +.. role:: py(code) + :language: python + :class: highlight + + +Basic Vocabulary +################ + +.. autoclass:: pylibraft.common.Handle + :members: + +.. autoclass:: pylibraft.common.Stream + :members: + +.. autoclass:: pylibraft.common.device_ndarray + :members: + +Interruptible +############# + +.. autofunction:: pylibraft.common.interruptible.cuda_interruptible +.. autofunction:: pylibraft.common.interruptible.synchronize +.. autofunction:: pylibraft.common.interruptible.cuda_yield + + +CUDA Array Interface Helpers +############################ + +.. 
autoclass:: pylibraft.common.cai_wrapper + :members: diff --git a/docs/source/pylibraft_api/distance.rst b/docs/source/pylibraft_api/distance.rst new file mode 100644 index 0000000000..d14ed6fc08 --- /dev/null +++ b/docs/source/pylibraft_api/distance.rst @@ -0,0 +1,15 @@ +Distance +======== + +This page provides `pylibraft` class references for the publicly-exposed elements of the `pylibraft.distance` package. RAFT's +distances have been highly optimized and support a wide assortment of different distance measures. + + +.. role:: py(code) + :language: python + :class: highlight + +.. autofunction:: pylibraft.distance.pairwise_distance + +.. autofunction:: pylibraft.distance.fused_l2_nn_argmin + diff --git a/docs/source/pylibraft_api/neighbors.rst b/docs/source/pylibraft_api/neighbors.rst new file mode 100644 index 0000000000..7112a3878c --- /dev/null +++ b/docs/source/pylibraft_api/neighbors.rst @@ -0,0 +1,18 @@ +Neighbors +========= + +This page provides pylibraft class references for the publicly-exposed elements of the neighbors package. + +.. role:: py(code) + :language: python + :class: highlight + +.. autoclass:: pylibraft.neighbors.ivf_pq.IndexParams + +.. autofunction:: pylibraft.neighbors.ivf_pq.build + +.. autofunction:: pylibraft.neighbors.ivf_pq.extend + +.. autoclass:: pylibraft.neighbors.ivf_pq.SearchParams + +.. autofunction:: pylibraft.neighbors.ivf_pq.search diff --git a/docs/source/pylibraft_api/random.rst b/docs/source/pylibraft_api/random.rst new file mode 100644 index 0000000000..538d932757 --- /dev/null +++ b/docs/source/pylibraft_api/random.rst @@ -0,0 +1,12 @@ +Random +====== + +This page provides pylibraft class references for the publicly-exposed elements of the `pylibraft.random` package. + + +.. role:: py(code) + :language: python + :class: highlight + + +.. 
autofunction:: pylibraft.random.rmat \ No newline at end of file diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index 38ed031759..60071f2461 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -1,6 +1,6 @@ # Quick Start -This guide is meant to provide a quick-start tutorial for interacting with RAFT's C++ APIs. +This guide is meant to provide a quick-start tutorial for interacting with RAFT's C++ & Python APIs. ## RAPIDS Memory Manager (RMM) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..dfd22f3378 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,19 @@ +[tool.black] +line-length = 79 +target-version = ["py38"] +include = '\.py?$' +force-exclude = ''' +/( + thirdparty | + \.eggs | + \.git | + \.hg | + \.mypy_cache | + \.tox | + \.venv | + _build | + buck-out | + build | + dist +)/ +''' diff --git a/python/.flake8 b/python/.flake8 deleted file mode 100644 index ef2e5a8495..0000000000 --- a/python/.flake8 +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. - -[flake8] -exclude = __init__.py -ignore = - # line break before binary operator - W503 - # whitespace before : - E203 \ No newline at end of file diff --git a/python/.flake8.cython b/python/.flake8.cython deleted file mode 100644 index 3cd436d3f3..0000000000 --- a/python/.flake8.cython +++ /dev/null @@ -1,28 +0,0 @@ -# -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -[flake8] -filename = *.pyx, *.pxd -exclude = *.egg, build, docs, .git -ignore = E999, E225, E226, E227, W503, W504 - -# Rules ignored: -# E999: invalid syntax (works for Python, not Cython) -# E225: Missing whitespace around operators (breaks cython casting syntax like ) -# E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) -# E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) -# W503: line break before binary operator (breaks lines that start with a pointer) -# W504: line break after binary operator (breaks lines that end with a pointer) diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt index e10adc8631..3efc3a547b 100644 --- a/python/pylibraft/CMakeLists.txt +++ b/python/pylibraft/CMakeLists.txt @@ -25,18 +25,25 @@ project( # language to be enabled here. The test project that is built in scikit-build to verify # various linking options for the python library is hardcoded to build with C, so until # that is fixed we need to keep C. - C - CXX) + C CXX +) -option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files" ON) +option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files" + ON +) + +option(RAFT_BUILD_WHEELS "Whether this build is generating a Python wheel." OFF) # If the user requested it we attempt to find RAFT. if(FIND_RAFT_CPP) - find_package(raft ${pylibraft_version} REQUIRED COMPONENTS distance) - if(NOT TARGET raft::raft_distance_lib) - message(FATAL_ERROR "Building against a preexisting libraft library requires the distance components of that library to have been built!") + find_package(raft ${pylibraft_version} REQUIRED COMPONENTS distance) + if(NOT TARGET raft::raft_distance_lib) + message( + FATAL_ERROR + "Building against a preexisting libraft library requires the distance components of that library to have been built!" 
+ ) - endif() + endif() else() set(raft_FOUND OFF) endif() @@ -44,9 +51,8 @@ endif() include(rapids-cython) if(NOT raft_FOUND) - # TODO: This will not be necessary once we upgrade to CMake 3.22, which will - # pull in the required languages for the C++ project even if this project - # does not require those languges. + # TODO: This will not be necessary once we upgrade to CMake 3.22, which will pull in the required + # languages for the C++ project even if this project does not require those languages. include(rapids-cuda) rapids_cuda_init_architectures(pylibraft) enable_language(CUDA) @@ -58,11 +64,20 @@ if(NOT raft_FOUND) set(BUILD_BENCH OFF) set(RAFT_COMPILE_LIBRARIES OFF) set(RAFT_COMPILE_DIST_LIBRARY ON) - add_subdirectory(../../cpp raft-cpp) - # When building the C++ libraries from source we must copy - # libraft_distance.so alongside the pairwise_distance and random Cython libraries - # TODO: when we have a single 'compiled' raft library, we shouldn't need this + set(_exclude_from_all "") + if(RAFT_BUILD_WHEELS) + # Statically link dependencies if building wheels + set(CUDA_STATIC_RUNTIME ON) + # Don't install the raft C++ targets into wheels + set(_exclude_from_all EXCLUDE_FROM_ALL) + endif() + + add_subdirectory(../../cpp raft-cpp ${_exclude_from_all}) + + # When building the C++ libraries from source we must copy libraft_distance.so alongside the + # pairwise_distance and random Cython libraries TODO: when we have a single 'compiled' raft + # library, we shouldn't need this set(cython_lib_dir pylibraft) install(TARGETS raft_distance_lib DESTINATION ${cython_lib_dir}) endif() @@ -71,6 +86,7 @@ rapids_cython_init() add_subdirectory(pylibraft/common) add_subdirectory(pylibraft/distance) +add_subdirectory(pylibraft/neighbors) add_subdirectory(pylibraft/random) add_subdirectory(pylibraft/cluster) diff --git a/python/pylibraft/LICENSE b/python/pylibraft/LICENSE new file mode 120000 index 0000000000..30cff7403d --- /dev/null +++ b/python/pylibraft/LICENSE @@ 
-0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/pylibraft/_custom_build/backend.py b/python/pylibraft/_custom_build/backend.py new file mode 100644 index 0000000000..7d1b334626 --- /dev/null +++ b/python/pylibraft/_custom_build/backend.py @@ -0,0 +1,37 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +"""Custom build backend for pylibraft to get versioned requirements. + +Based on https://setuptools.pypa.io/en/latest/build_meta.html +""" +import os +from functools import wraps + +from setuptools import build_meta as _orig + +# Alias the required bits +build_wheel = _orig.build_wheel +build_sdist = _orig.build_sdist + + +def replace_requirements(func): + @wraps(func) + def wrapper(config_settings=None): + orig_list = getattr(_orig, func.__name__)(config_settings) + append_list = [ + f"rmm{os.getenv('RAPIDS_PY_WHEEL_CUDA_SUFFIX', default='')}" + ] + return orig_list + append_list + + return wrapper + + +get_requires_for_build_wheel = replace_requirements( + _orig.get_requires_for_build_wheel +) +get_requires_for_build_sdist = replace_requirements( + _orig.get_requires_for_build_sdist +) +get_requires_for_build_editable = replace_requirements( + _orig.get_requires_for_build_editable +) diff --git a/python/pylibraft/pylibraft/_version.py b/python/pylibraft/pylibraft/_version.py index 58cd44da3b..81de84e96e 100644 --- a/python/pylibraft/pylibraft/_version.py +++ b/python/pylibraft/pylibraft/_version.py @@ -70,7 +70,7 @@ def decorate(f): def run_command( - commands, args, cwd=None, verbose=False, hide_stderr=False, env=None + commands, args, cwd=None, verbose=False, hide_stderr=False, env=None ): """Call the given command(s).""" assert isinstance(commands, list) @@ -85,7 +85,7 @@ def run_command( env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), - ) + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -123,7 +123,7 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): dirname = 
os.path.basename(root) if dirname.startswith(parentdir_prefix): return { - "version": dirname[len(parentdir_prefix):], + "version": dirname[len(parentdir_prefix) :], "full-revisionid": None, "dirty": False, "error": None, @@ -193,7 +193,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -210,7 +210,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) return { @@ -264,7 +264,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): "--long", "--match", "%s*" % tag_prefix, - ], + ], cwd=root, ) # --long was added in git-1.5.5 @@ -299,7 +299,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if not mo: # unparseable. Maybe git-describe is misbehaving? 
pieces["error"] = ( - "unable to parse git-describe output: '%s'" % describe_out + "unable to parse git-describe output: '%s'" % describe_out ) return pieces @@ -314,7 +314,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): tag_prefix, ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) diff --git a/python/pylibraft/pylibraft/cluster/CMakeLists.txt b/python/pylibraft/pylibraft/cluster/CMakeLists.txt index 44e34e0213..ba77403a5d 100644 --- a/python/pylibraft/pylibraft/cluster/CMakeLists.txt +++ b/python/pylibraft/pylibraft/cluster/CMakeLists.txt @@ -14,15 +14,11 @@ # Set the list of Cython files to build set(cython_sources kmeans.pyx) -set(linked_libraries raft::raft raft::distance) +set(linked_libraries raft::distance) # Build all of the Cython targets rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" - MODULE_PREFIX cluster_) - -foreach(cython_module IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) - set_target_properties(${cython_module} PROPERTIES INSTALL_RPATH "\$ORIGIN;\$ORIGIN/../library") -endforeach() + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX cluster_ +) diff --git a/python/pylibraft/pylibraft/cluster/__init__.py b/python/pylibraft/pylibraft/cluster/__init__.py index 273b4497cc..89a403fce2 100644 --- a/python/pylibraft/pylibraft/cluster/__init__.py +++ b/python/pylibraft/pylibraft/cluster/__init__.py @@ -12,3 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# + +from .kmeans import compute_new_centroids diff --git a/python/pylibraft/pylibraft/cluster/kmeans.pyx b/python/pylibraft/pylibraft/cluster/kmeans.pyx index 940d118c3b..679523cef4 100644 --- a/python/pylibraft/pylibraft/cluster/kmeans.pyx +++ b/python/pylibraft/pylibraft/cluster/kmeans.pyx @@ -20,18 +20,23 @@ import numpy as np -from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref - -from libcpp cimport bool -from libcpp cimport nullptr +from libc.stdint cimport uintptr_t +from libcpp cimport bool, nullptr from pylibraft.common import Handle from pylibraft.common.handle import auto_sync_handle + from pylibraft.common.handle cimport handle_t + from pylibraft.common.input_validation import * from pylibraft.distance import DISTANCE_TYPES +from pylibraft.cpp.kmeans cimport ( + cluster_cost as cpp_cluster_cost, + update_centroids, +) + def is_c_cont(cai, dt): return "strides" not in cai or \ @@ -39,34 +44,6 @@ def is_c_cont(cai, dt): cai["strides"][1] == dt.itemsize -cdef extern from "raft_distance/kmeans.hpp" \ - namespace "raft::cluster::kmeans::runtime": - - cdef void update_centroids( - const handle_t& handle, - const double *X, - int n_samples, - int n_features, - int n_clusters, - const double *sample_weights, - const double *centroids, - const int* labels, - double *new_centroids, - double *weight_per_cluster) except + - - cdef void update_centroids( - const handle_t& handle, - const float *X, - int n_samples, - int n_features, - int n_clusters, - const float *sample_weights, - const float *centroids, - const int* labels, - float *new_centroids, - float *weight_per_cluster) except + - - @auto_sync_handle def compute_new_centroids(X, centroids, @@ -110,7 +87,6 @@ def compute_new_centroids(X, from pylibraft.common import Handle from pylibraft.cluster.kmeans import compute_new_centroids - from pylibraft.distance import fused_l2_nn_argmin # A single RAFT handle can optionally be reused across # pylibraft functions. 
@@ -221,3 +197,91 @@ def compute_new_centroids(X, weight_per_cluster_ptr) else: raise ValueError("dtype %s not supported" % x_dt) + + +@auto_sync_handle +def cluster_cost(X, centroids, handle=None): + """ + Compute cluster cost given an input matrix and existing centroids + + Parameters + ---------- + X : Input CUDA array interface compliant matrix shape (m, k) + centroids : Input CUDA array interface compliant matrix shape + (n_clusters, k) + {handle_docstring} + + Examples + -------- + + .. code-block:: python + import cupy as cp + + from pylibraft.cluster.kmeans import cluster_cost + + n_samples = 5000 + n_features = 50 + n_clusters = 3 + + X = cp.random.random_sample((n_samples, n_features), + dtype=cp.float32) + + centroids = cp.random.random_sample((n_clusters, n_features), + dtype=cp.float32) + + inertia = cluster_cost(X, centroids) + """ + x_cai = X.__cuda_array_interface__ + centroids_cai = centroids.__cuda_array_interface__ + + m = x_cai["shape"][0] + x_k = x_cai["shape"][1] + n_clusters = centroids_cai["shape"][0] + + centroids_k = centroids_cai["shape"][1] + + x_dt = np.dtype(x_cai["typestr"]) + centroids_dt = np.dtype(centroids_cai["typestr"]) + + if not do_cols_match(X, centroids): + raise ValueError("X and centroids must have same number of columns.") + + x_ptr = x_cai["data"][0] + centroids_ptr = centroids_cai["data"][0] + + handle = handle if handle is not None else Handle() + cdef handle_t *h = handle.getHandle() + + x_c_contiguous = is_c_cont(x_cai, x_dt) + centroids_c_contiguous = is_c_cont(centroids_cai, centroids_dt) + + if not x_c_contiguous or not centroids_c_contiguous: + raise ValueError("Inputs must all be c contiguous") + + if not do_dtypes_match(X, centroids): + raise ValueError("Inputs must all have the same dtypes " + "(float32 or float64)") + + cdef float f_cost = 0 + cdef double d_cost = 0 + + if x_dt == np.float32: + cpp_cluster_cost(deref(h), + x_ptr, + m, + x_k, + n_clusters, + centroids_ptr, + &f_cost) + return f_cost + elif 
x_dt == np.float64: + cpp_cluster_cost(deref(h), + x_ptr, + m, + x_k, + n_clusters, + centroids_ptr, + &d_cost) + return d_cost + else: + raise ValueError("dtype %s not supported" % x_dt) diff --git a/python/pylibraft/pylibraft/common/CMakeLists.txt b/python/pylibraft/pylibraft/common/CMakeLists.txt index f8d86e0386..3b49cef429 100644 --- a/python/pylibraft/pylibraft/common/CMakeLists.txt +++ b/python/pylibraft/pylibraft/common/CMakeLists.txt @@ -18,8 +18,7 @@ set(linked_libraries raft::raft) # Build all of the Cython targets rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" - ASSOCIATED_TARGETS raft - MODULE_PREFIX common_) + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX common_ +) diff --git a/python/pylibraft/pylibraft/common/__init__.py b/python/pylibraft/pylibraft/common/__init__.py index 7872599a78..33c2986487 100644 --- a/python/pylibraft/pylibraft/common/__init__.py +++ b/python/pylibraft/pylibraft/common/__init__.py @@ -13,5 +13,7 @@ # limitations under the License. # +from .cai_wrapper import cai_wrapper from .cuda import Stream +from .device_ndarray import device_ndarray from .handle import Handle diff --git a/python/pylibraft/pylibraft/common/cai_wrapper.py b/python/pylibraft/pylibraft/common/cai_wrapper.py new file mode 100644 index 0000000000..fdfc6b0b09 --- /dev/null +++ b/python/pylibraft/pylibraft/common/cai_wrapper.py @@ -0,0 +1,73 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import numpy as np + +from pylibraft.common import input_validation + + +class cai_wrapper: + """ + Simple wrapper around a CUDA array interface object to reduce + boilerplate for extracting common information from the underlying + dictionary. + """ + + def __init__(self, cai_arr): + """ + Constructor accepts a CUDA array interface compliant array + + Parameters + ---------- + cai_arr : CUDA array interface array + """ + self.cai_ = cai_arr.__cuda_array_interface__ + + @property + def dtype(self): + """ + Returns the dtype of the underlying CUDA array interface + """ + return np.dtype(self.cai_["typestr"]) + + @property + def shape(self): + """ + Returns the shape of the underlying CUDA array interface + """ + return self.cai_["shape"] + + @property + def c_contiguous(self): + """ + Returns whether the underlying CUDA array interface has + c-ordered (row-major) layout + """ + return input_validation.is_c_contiguous(self.cai_) + + @property + def f_contiguous(self): + """ + Returns whether the underlying CUDA array interface has + f-ordered (column-major) layout + """ + return not input_validation.is_c_contiguous(self.cai_) + + @property + def data(self): + """ + Returns the data pointer of the underlying CUDA array interface + """ + return self.cai_["data"][0] diff --git a/python/pylibraft/pylibraft/common/cuda.pxd b/python/pylibraft/pylibraft/common/cuda.pxd index ae6246dee1..a44d9aeb63 100644 --- a/python/pylibraft/pylibraft/common/cuda.pxd +++ b/python/pylibraft/pylibraft/common/cuda.pxd @@ -16,6 +16,7 @@ from cuda.ccudart cimport cudaStream_t + cdef 
class Stream: cdef cudaStream_t s diff --git a/python/pylibraft/pylibraft/common/cuda.pyx b/python/pylibraft/pylibraft/common/cuda.pyx index 9b35aebdba..606860dbe9 100644 --- a/python/pylibraft/pylibraft/common/cuda.pyx +++ b/python/pylibraft/pylibraft/common/cuda.pyx @@ -19,16 +19,16 @@ # cython: embedsignature = True # cython: language_level = 3 -from cuda.ccudart cimport( - cudaStream_t, +from cuda.ccudart cimport ( cudaError_t, - cudaSuccess, + cudaGetErrorName, + cudaGetErrorString, + cudaGetLastError, + cudaStream_t, cudaStreamCreate, cudaStreamDestroy, cudaStreamSynchronize, - cudaGetLastError, - cudaGetErrorString, - cudaGetErrorName + cudaSuccess, ) diff --git a/python/pylibraft/pylibraft/common/device_ndarray.py b/python/pylibraft/pylibraft/common/device_ndarray.py new file mode 100644 index 0000000000..eebbca2f06 --- /dev/null +++ b/python/pylibraft/pylibraft/common/device_ndarray.py @@ -0,0 +1,158 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np + +import rmm + + +class device_ndarray: + """ + pylibraft.common.device_ndarray is meant to be a very lightweight + __cuda_array_interface__ wrapper around a numpy.ndarray. 
+ """ + + def __init__(self, np_ndarray): + """ + Construct a pylibraft.common.device_ndarray wrapper around a + numpy.ndarray + + Parameters + ---------- + ndarray : A numpy.ndarray which will be copied and moved to the device + + Examples + -------- + The device_ndarray is __cuda_array_interface__ compliant so it is + interoperable with other libraries that also support it, such as + CuPy and PyTorch. + + The following usage example demonstrates + converting a pylibraft.common.device_ndarray to a cupy.ndarray: + .. code-block:: python + + import cupy as cp + from pylibraft.common import device_ndarray + + raft_array = device_ndarray.empty((100, 50)) + cupy_array = cp.asarray(raft_array) + + And the converting pylibraft.common.device_ndarray to a PyTorch tensor: + .. code-block:: python + + import torch + from pylibraft.common import device_ndarray + + raft_array = device_ndarray.empty((100, 50)) + torch_tensor = torch.as_tensor(raft_array, device='cuda') + """ + self.ndarray_ = np_ndarray + order = "C" if self.c_contiguous else "F" + self.device_buffer_ = rmm.DeviceBuffer.to_device( + self.ndarray_.tobytes(order=order) + ) + + @classmethod + def empty(cls, shape, dtype=np.float32, order="C"): + """ + Return a new device_ndarray of given shape and type, without + initializing entries. + + Parameters + ---------- + shape : int or tuple of int + Shape of the empty array, e.g., (2, 3) or 2. + dtype : data-type, optional + Desired output data-type for the array, e.g, numpy.int8. + Default is numpy.float32. + order : {'C', 'F'}, optional (default: 'C') + Whether to store multi-dimensional dat ain row-major (C-style) + or column-major (Fortran-style) order in memory + """ + arr = np.empty(shape, dtype=dtype, order=order) + return cls(arr) + + @property + def c_contiguous(self): + """ + Is the current device_ndarray laid out in row-major format? 
+ """ + array_interface = self.ndarray_.__array_interface__ + strides = self.strides + return ( + strides is None + or array_interface["strides"][1] == self.dtype.itemsize + ) + + @property + def f_contiguous(self): + """ + Is the current device_ndarray laid out in column-major format? + """ + return not self.c_contiguous + + @property + def dtype(self): + """ + Datatype of the current device_ndarray instance + """ + array_interface = self.ndarray_.__array_interface__ + return np.dtype(array_interface["typestr"]) + + @property + def shape(self): + """ + Shape of the current device_ndarray instance + """ + array_interface = self.ndarray_.__array_interface__ + return array_interface["shape"] + + @property + def strides(self): + """ + Strides of the current device_ndarray instance + """ + array_interface = self.ndarray_.__array_interface__ + return ( + None + if "strides" not in array_interface + else array_interface["strides"] + ) + + @property + def __cuda_array_interface__(self): + """ + Returns the __cuda_array_interface__ compliant dict for + integrating with other device-enabled libraries using + zero-copy semantics. 
+ """ + device_cai = self.device_buffer_.__cuda_array_interface__ + host_cai = self.ndarray_.__array_interface__.copy() + host_cai["data"] = (device_cai["data"][0], device_cai["data"][1]) + + return host_cai + + def copy_to_host(self): + """ + Returns a new numpy.ndarray object on host with the current contents of + this device_ndarray + """ + ret = np.frombuffer( + self.device_buffer_.tobytes(), + dtype=self.dtype, + like=self.ndarray_, + ).astype(self.dtype) + ret = np.lib.stride_tricks.as_strided(ret, self.shape, self.strides) + return ret diff --git a/python/pylibraft/pylibraft/common/handle.pxd b/python/pylibraft/pylibraft/common/handle.pxd index d7abe236ba..e763768eac 100644 --- a/python/pylibraft/pylibraft/common/handle.pxd +++ b/python/pylibraft/pylibraft/common/handle.pxd @@ -20,11 +20,11 @@ # cython: language_level = 3 -from libcpp.memory cimport shared_ptr -from rmm._lib.cuda_stream_view cimport cuda_stream_view +from libcpp.memory cimport shared_ptr, unique_ptr + from rmm._lib.cuda_stream_pool cimport cuda_stream_pool -from libcpp.memory cimport shared_ptr -from libcpp.memory cimport unique_ptr +from rmm._lib.cuda_stream_view cimport cuda_stream_view + cdef extern from "raft/core/handle.hpp" namespace "raft" nogil: cdef cppclass handle_t: diff --git a/python/pylibraft/pylibraft/common/handle.pyx b/python/pylibraft/pylibraft/common/handle.pyx index 3e9ed569ad..c14c22f5aa 100644 --- a/python/pylibraft/pylibraft/common/handle.pyx +++ b/python/pylibraft/pylibraft/common/handle.pyx @@ -21,10 +21,10 @@ import functools -from rmm._lib.cuda_stream_view cimport cuda_stream_per_thread -from rmm._lib.cuda_stream_view cimport cuda_stream_view +from rmm._lib.cuda_stream_view cimport cuda_stream_per_thread, cuda_stream_view from .cuda cimport Stream + from .cuda import CudaRuntimeError diff --git a/python/pylibraft/pylibraft/common/input_validation.py b/python/pylibraft/pylibraft/common/input_validation.py index d5556a79dc..61435a859c 100644 --- 
a/python/pylibraft/pylibraft/common/input_validation.py +++ b/python/pylibraft/pylibraft/common/input_validation.py @@ -18,6 +18,8 @@ # cython: embedsignature = True # cython: language_level = 3 +import numpy as np + def do_dtypes_match(*cais): last_dtype = cais[0].__cuda_array_interface__["typestr"] @@ -57,3 +59,20 @@ def do_shapes_match(*cais): return False last_shape = shape return True + + +def is_c_contiguous(cai): + """ + Checks whether an array is C contiguous. + + Parameters + ---------- + cai : CUDA array interface + + """ + dt = np.dtype(cai["typestr"]) + return ( + "strides" not in cai + or cai["strides"] is None + or cai["strides"][1] == dt.itemsize + ) diff --git a/python/pylibraft/pylibraft/common/interruptible.pxd b/python/pylibraft/pylibraft/common/interruptible.pxd index a4c7d90ac2..aaccf8aeab 100644 --- a/python/pylibraft/pylibraft/common/interruptible.pxd +++ b/python/pylibraft/pylibraft/common/interruptible.pxd @@ -20,8 +20,10 @@ # cython: language_level = 3 from libcpp.memory cimport shared_ptr + from rmm._lib.cuda_stream_view cimport cuda_stream_view + cdef extern from "raft/core/interruptible.hpp" namespace "raft" nogil: cdef cppclass interruptible: void cancel() diff --git a/python/pylibraft/pylibraft/common/interruptible.pyx b/python/pylibraft/pylibraft/common/interruptible.pyx index dfc95490ed..fc2e6d9e1f 100644 --- a/python/pylibraft/pylibraft/common/interruptible.pyx +++ b/python/pylibraft/pylibraft/common/interruptible.pyx @@ -21,10 +21,12 @@ import contextlib import signal + +from cuda.ccudart cimport cudaStream_t from cython.operator cimport dereference from rmm._lib.cuda_stream_view cimport cuda_stream_view -from cuda.ccudart cimport cudaStream_t + from .cuda cimport Stream diff --git a/python/pylibraft/pylibraft/cpp/__init__.pxd b/python/pylibraft/pylibraft/cpp/__init__.pxd new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/pylibraft/pylibraft/cpp/__init__.py b/python/pylibraft/pylibraft/cpp/__init__.py new file 
mode 100644 index 0000000000..e69de29bb2 diff --git a/python/pylibraft/pylibraft/cpp/kmeans.pxd b/python/pylibraft/pylibraft/cpp/kmeans.pxd new file mode 100644 index 0000000000..b263952522 --- /dev/null +++ b/python/pylibraft/pylibraft/cpp/kmeans.pxd @@ -0,0 +1,67 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +from pylibraft.common.handle cimport handle_t + + +cdef extern from "raft_distance/kmeans.hpp" \ + namespace "raft::cluster::kmeans::runtime": + + cdef void update_centroids( + const handle_t& handle, + const double *X, + int n_samples, + int n_features, + int n_clusters, + const double *sample_weights, + const double *centroids, + const int* labels, + double *new_centroids, + double *weight_per_cluster) except + + + cdef void update_centroids( + const handle_t& handle, + const float *X, + int n_samples, + int n_features, + int n_clusters, + const float *sample_weights, + const float *centroids, + const int* labels, + float *new_centroids, + float *weight_per_cluster) except + + + cdef void cluster_cost( + const handle_t& handle, + const float* X, + int n_samples, + int n_features, + int n_clusters, + const float * centroids, + float * cost) except + + + cdef void cluster_cost( + const handle_t& handle, + const double* X, + int n_samples, + int n_features, + int n_clusters, + const double * 
centroids, + double * cost) except + diff --git a/python/pylibraft/pylibraft/distance/CMakeLists.txt b/python/pylibraft/pylibraft/distance/CMakeLists.txt index 01cdbd0876..cae00007d6 100644 --- a/python/pylibraft/pylibraft/distance/CMakeLists.txt +++ b/python/pylibraft/pylibraft/distance/CMakeLists.txt @@ -13,14 +13,12 @@ # ============================================================================= # Set the list of Cython files to build -set(cython_sources pairwise_distance.pyx - fused_l2_nn.pyx) +set(cython_sources pairwise_distance.pyx fused_l2_nn.pyx) set(linked_libraries raft::raft raft::distance) # Build all of the Cython targets rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" - ASSOCIATED_TARGETS raft - MODULE_PREFIX distance_) + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX distance_ +) diff --git a/python/pylibraft/pylibraft/distance/__init__.py b/python/pylibraft/pylibraft/distance/__init__.py index 7d68f02936..b251e71ba3 100644 --- a/python/pylibraft/pylibraft/distance/__init__.py +++ b/python/pylibraft/pylibraft/distance/__init__.py @@ -14,5 +14,4 @@ # from .fused_l2_nn import fused_l2_nn_argmin -from .pairwise_distance import distance as pairwise_distance -from .pairwise_distance import DISTANCE_TYPES \ No newline at end of file +from .pairwise_distance import DISTANCE_TYPES, distance as pairwise_distance diff --git a/python/pylibraft/pylibraft/distance/distance_type.pxd b/python/pylibraft/pylibraft/distance/distance_type.pxd index ab865670bb..e058238d45 100644 --- a/python/pylibraft/pylibraft/distance/distance_type.pxd +++ b/python/pylibraft/pylibraft/distance/distance_type.pxd @@ -18,7 +18,7 @@ # cython: embedsignature = True # cython: language_level = 3 -cdef extern from "raft/distance/distance_type.hpp" namespace "raft::distance": +cdef extern from "raft/distance/distance_types.hpp" namespace "raft::distance": 
ctypedef enum DistanceType: L2Expanded "raft::distance::DistanceType::L2Expanded" diff --git a/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx index 75b9670005..3f30e6d0a8 100644 --- a/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx +++ b/python/pylibraft/pylibraft/distance/fused_l2_nn.pyx @@ -20,20 +20,16 @@ import numpy as np -from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref - +from libc.stdint cimport uintptr_t from libcpp cimport bool + from .distance_type cimport DistanceType -from pylibraft.common import Handle -from pylibraft.common.handle import auto_sync_handle -from pylibraft.common.handle cimport handle_t +from pylibraft.common import Handle, cai_wrapper, device_ndarray +from pylibraft.common.handle import auto_sync_handle -def is_c_cont(cai, dt): - return "strides" not in cai or \ - cai["strides"] is None or \ - cai["strides"][1] == dt.itemsize +from pylibraft.common.handle cimport handle_t cdef extern from "raft_distance/fused_l2_min_arg.hpp" \ @@ -61,7 +57,7 @@ cdef extern from "raft_distance/fused_l2_min_arg.hpp" \ @auto_sync_handle -def fused_l2_nn_argmin(X, Y, output, sqrt=True, handle=None): +def fused_l2_nn_argmin(X, Y, out=None, sqrt=True, handle=None): """ Compute the 1-nearest neighbors between X and Y using the L2 distance @@ -75,63 +71,93 @@ def fused_l2_nn_argmin(X, Y, output, sqrt=True, handle=None): Examples -------- + To compute the 1-nearest neighbors argmin: .. code-block:: python import cupy as cp - from pylibraft.common import Handle from pylibraft.distance import fused_l2_nn_argmin - n_samples = 5000 n_clusters = 5 n_features = 50 + in1 = cp.random.random_sample((n_samples, n_features), + dtype=cp.float32) + in2 = cp.random.random_sample((n_clusters, n_features), + dtype=cp.float32) + # A single RAFT handle can optionally be reused across + # pylibraft functions. + handle = Handle() + ... 
+ output = fused_l2_nn_argmin(in1, in2, output, handle=handle) + ... + # pylibraft functions are often asynchronous so the + # handle needs to be explicitly synchronized + handle.sync() + + The output can also be computed in-place on a preallocated + array: + + .. code-block:: python + import cupy as cp + from pylibraft.common import Handle + from pylibraft.distance import fused_l2_nn_argmin + n_samples = 5000 + n_clusters = 5 + n_features = 50 in1 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) in2 = cp.random.random_sample((n_clusters, n_features), dtype=cp.float32) output = cp.empty((n_samples, 1), dtype=cp.int32) - # A single RAFT handle can optionally be reused across # pylibraft functions. handle = Handle() ... - fused_l2_nn_argmin(in1, in2, output, handle=handle) + fused_l2_nn_argmin(in1, in2, out=output, handle=handle) ... # pylibraft functions are often asynchronous so the # handle needs to be explicitly synchronized handle.sync() + """ - x_cai = X.__cuda_array_interface__ - y_cai = Y.__cuda_array_interface__ - output_cai = output.__cuda_array_interface__ + x_cai = cai_wrapper(X) + y_cai = cai_wrapper(Y) - m = x_cai["shape"][0] - n = y_cai["shape"][0] + x_dt = x_cai.dtype + y_dt = y_cai.dtype - x_k = x_cai["shape"][1] - y_k = y_cai["shape"][1] + m = x_cai.shape[0] + n = y_cai.shape[0] + + if out is None: + output = device_ndarray.empty((m,), dtype="int32") + else: + output = out + + output_cai = cai_wrapper(output) + + x_k = x_cai.shape[1] + y_k = y_cai.shape[1] if x_k != y_k: raise ValueError("Inputs must have same number of columns. 
" "a=%s, b=%s" % (x_k, y_k)) - x_ptr = x_cai["data"][0] - y_ptr = y_cai["data"][0] + x_ptr = x_cai.data + y_ptr = y_cai.data - d_ptr = output_cai["data"][0] + d_ptr = output_cai.data handle = handle if handle is not None else Handle() cdef handle_t *h = handle.getHandle() - x_dt = np.dtype(x_cai["typestr"]) - y_dt = np.dtype(y_cai["typestr"]) - d_dt = np.dtype(output_cai["typestr"]) + d_dt = output_cai.dtype - x_c_contiguous = is_c_cont(x_cai, x_dt) - y_c_contiguous = is_c_cont(y_cai, y_dt) + x_c_contiguous = x_cai.c_contiguous + y_c_contiguous = y_cai.c_contiguous if x_c_contiguous != y_c_contiguous: raise ValueError("Inputs must have matching strides") @@ -161,3 +187,5 @@ def fused_l2_nn_argmin(X, Y, output, sqrt=True, handle=None): sqrt) else: raise ValueError("dtype %s not supported" % x_dt) + + return output diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index b6b73a86a1..76e70e3926 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -20,21 +20,18 @@ import numpy as np -from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref - +from libc.stdint cimport uintptr_t from libcpp cimport bool + from .distance_type cimport DistanceType from pylibraft.common import Handle from pylibraft.common.handle import auto_sync_handle -from pylibraft.common.handle cimport handle_t +from pylibraft.common.handle cimport handle_t -def is_c_cont(cai, dt): - return "strides" not in cai or \ - cai["strides"] is None or \ - cai["strides"][1] == dt.itemsize +from pylibraft.common import cai_wrapper, device_ndarray cdef extern from "raft_distance/pairwise_distance.hpp" \ @@ -92,7 +89,7 @@ SUPPORTED_DISTANCES = ["euclidean", "l1", "cityblock", "l2", "inner_product", @auto_sync_handle -def distance(X, Y, dists, metric="euclidean", p=2.0, handle=None): +def distance(X, Y, out=None, 
metric="euclidean", p=2.0, handle=None): """ Compute pairwise distances between X and Y @@ -107,68 +104,113 @@ def distance(X, Y, dists, metric="euclidean", p=2.0, handle=None): X : CUDA array interface compliant matrix shape (m, k) Y : CUDA array interface compliant matrix shape (n, k) - dists : Writable CUDA array interface matrix shape (m, n) + out : Optional writable CUDA array interface matrix shape (m, n) metric : string denoting the metric type (default="euclidean") p : metric parameter (currently used only for "minkowski") {handle_docstring} + Returns + ------- + + raft.device_ndarray containing pairwise distances + Examples -------- + To compute pairwise distances on cupy arrays: .. code-block:: python import cupy as cp - from pylibraft.common import Handle from pylibraft.distance import pairwise_distance - n_samples = 5000 n_features = 50 - in1 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) in2 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) + + A single RAFT handle can optionally be reused across + pylibraft functions. + + .. code-block:: python + + handle = Handle() + output = pairwise_distance(in1, in2, metric="euclidean", handle=handle) + + pylibraft functions are often asynchronous so the + handle needs to be explicitly synchronized + + .. code-block:: python + + handle.sync() + + It's also possible to write to a pre-allocated output array: + + .. code-block:: python + + import cupy as cp + from pylibraft.common import Handle + from pylibraft.distance import pairwise_distance + n_samples = 5000 + n_features = 50 + in1 = cp.random.random_sample((n_samples, n_features), + dtype=cp.float32) + in2 = cp.random.random_sample((n_samples, n_features), + dtype=cp.float32) output = cp.empty((n_samples, n_samples), dtype=cp.float32) - # A single RAFT handle can optionally be reused across - # pylibraft functions. + A single RAFT handle can optionally be reused across + pylibraft functions. + + .. 
code-block:: python + handle = Handle() - ... - pairwise_distance(in1, in2, output, metric="euclidean", handle=handle) - ... - # pylibraft functions are often asynchronous so the - # handle needs to be explicitly synchronized + pairwise_distance(in1, in2, out=output, + metric="euclidean", handle=handle) + + pylibraft functions are often asynchronous so the + handle needs to be explicitly synchronized + + .. code-block:: python + handle.sync() - """ + """ - x_cai = X.__cuda_array_interface__ - y_cai = Y.__cuda_array_interface__ - dists_cai = dists.__cuda_array_interface__ + x_cai = cai_wrapper(X) + y_cai = cai_wrapper(Y) - m = x_cai["shape"][0] - n = y_cai["shape"][0] + m = x_cai.shape[0] + n = y_cai.shape[0] - x_k = x_cai["shape"][1] - y_k = y_cai["shape"][1] + x_dt = x_cai.dtype + y_dt = y_cai.dtype + + if out is None: + dists = device_ndarray.empty((m, n), dtype=y_dt) + else: + dists = out + + x_k = x_cai.shape[1] + y_k = y_cai.shape[1] + + dists_cai = cai_wrapper(dists) if x_k != y_k: raise ValueError("Inputs must have same number of columns. 
" "a=%s, b=%s" % (x_k, y_k)) - x_ptr = x_cai["data"][0] - y_ptr = y_cai["data"][0] - d_ptr = dists_cai["data"][0] + x_ptr = x_cai.data + y_ptr = y_cai.data + d_ptr = dists_cai.data handle = handle if handle is not None else Handle() cdef handle_t *h = handle.getHandle() - x_dt = np.dtype(x_cai["typestr"]) - y_dt = np.dtype(y_cai["typestr"]) - d_dt = np.dtype(dists_cai["typestr"]) + d_dt = dists_cai.dtype - x_c_contiguous = is_c_cont(x_cai, x_dt) - y_c_contiguous = is_c_cont(y_cai, y_dt) + x_c_contiguous = x_cai.c_contiguous + y_c_contiguous = y_cai.c_contiguous if x_c_contiguous != y_c_contiguous: raise ValueError("Inputs must have matching strides") @@ -205,3 +247,5 @@ def distance(X, Y, dists, metric="euclidean", p=2.0, handle=None): p) else: raise ValueError("dtype %s not supported" % x_dt) + + return dists diff --git a/python/pylibraft/pylibraft/neighbors/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/CMakeLists.txt new file mode 100644 index 0000000000..79d0470e9a --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/CMakeLists.txt @@ -0,0 +1,25 @@ +# ============================================================================= +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +# Set the list of Cython files to build +set(linked_libraries raft::raft raft::distance) + +# Build all of the Cython targets +rapids_cython_create_modules( + CXX + SOURCE_FILES "" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX neighbors_ +) + +add_subdirectory(ivf_pq) diff --git a/python/pylibraft/pylibraft/testing/__init__.py b/python/pylibraft/pylibraft/neighbors/__init__.pxd similarity index 100% rename from python/pylibraft/pylibraft/testing/__init__.py rename to python/pylibraft/pylibraft/neighbors/__init__.pxd diff --git a/python/pylibraft/pylibraft/neighbors/__init__.py b/python/pylibraft/pylibraft/neighbors/__init__.py new file mode 100644 index 0000000000..273b4497cc --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt new file mode 100644 index 0000000000..cfce37b560 --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt @@ -0,0 +1,24 @@ +# ============================================================================= +# Copyright (c) 2022, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Set the list of Cython files to build +set(cython_sources ivf_pq.pyx) +set(linked_libraries raft::raft raft::distance) + +# Build all of the Cython targets +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX neighbors_ivfpq_ +) diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/__init__.pxd b/python/pylibraft/pylibraft/neighbors/ivf_pq/__init__.pxd new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/__init__.py b/python/pylibraft/pylibraft/neighbors/ivf_pq/__init__.py new file mode 100644 index 0000000000..8a231b2c8c --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .ivf_pq import Index, IndexParams, SearchParams, build, extend, search diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/c_ivf_pq.pxd b/python/pylibraft/pylibraft/neighbors/ivf_pq/c_ivf_pq.pxd new file mode 100644 index 0000000000..9728495bf8 --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/c_ivf_pq.pxd @@ -0,0 +1,177 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +import numpy as np + +import pylibraft.common.handle + +from cython.operator cimport dereference as deref +from libc.stdint cimport ( + int8_t, + int64_t, + uint8_t, + uint32_t, + uint64_t, + uintptr_t, +) +from libcpp cimport bool, nullptr + +from rmm._lib.memory_resource cimport device_memory_resource + +from pylibraft.common.handle cimport handle_t +from pylibraft.distance.distance_type cimport DistanceType + + +cdef extern from "library_types.h": + ctypedef enum cudaDataType_t: + CUDA_R_32F "CUDA_R_32F" # float + CUDA_R_16F "CUDA_R_16F" # half + + # uint8 - used to refer to IVF-PQ's fp8 storage type + CUDA_R_8U "CUDA_R_8U" + +cdef extern from "raft/neighbors/ann_types.hpp" \ + namespace "raft::neighbors::ann" nogil: + + cdef cppclass ann_index "raft::neighbors::index": + pass + + cdef cppclass ann_index_params "raft::spatial::knn::index_params": + DistanceType metric + float metric_arg + bool add_data_on_build + + cdef cppclass ann_search_params "raft::spatial::knn::search_params": + pass + + +cdef extern from "raft/neighbors/ivf_pq_types.hpp" \ + namespace "raft::neighbors::ivf_pq" nogil: + + ctypedef enum codebook_gen: + PER_SUBSPACE "raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE", + PER_CLUSTER "raft::neighbors::ivf_pq::codebook_gen::PER_CLUSTER" + + cpdef cppclass index_params(ann_index_params): + uint32_t n_lists + uint32_t kmeans_n_iters + double kmeans_trainset_fraction + uint32_t pq_bits + uint32_t pq_dim + codebook_gen codebook_kind + bool force_random_rotation + + cdef cppclass index[IdxT](ann_index): + index(const handle_t& handle, + DistanceType metric, + codebook_gen codebook_kind, + uint32_t n_lists, + uint32_t dim, + uint32_t pq_bits, + uint32_t pq_dim, + uint32_t n_nonempty_lists) + + IdxT size() + uint32_t dim() + uint32_t pq_dim() + uint32_t pq_len() + uint32_t pq_bits() + DistanceType metric() + uint32_t n_lists() + 
uint32_t rot_dim() + codebook_gen codebook_kind() + + cpdef cppclass search_params(ann_search_params): + uint32_t n_probes + cudaDataType_t lut_dtype + cudaDataType_t internal_distance_dtype + + +cdef extern from "raft/neighbors/specializations/ivf_pq_specialization.hpp" \ + namespace "raft::neighbors::ivf_pq" nogil: + + cdef void build(const handle_t& handle, + const index_params& params, + const float* dataset, + uint64_t n_rows, + uint32_t dim, + index[uint64_t]* index) except + + + cdef void build(const handle_t& handle, + const index_params& params, + const int8_t* dataset, + uint64_t n_rows, + uint32_t dim, + index[uint64_t]* index) except + + + cdef void build(const handle_t& handle, + const index_params& params, + const uint8_t* dataset, + uint64_t n_rows, + uint32_t dim, + index[uint64_t]* index) except + + + cdef void extend(const handle_t& handle, + index[uint64_t]* index, + const float* new_vectors, + const uint64_t* new_indices, + uint64_t n_rows) except + + + cdef void extend(const handle_t& handle, + index[uint64_t]* index, + const int8_t* new_vectors, + const uint64_t* new_indices, + uint64_t n_rows) except + + + cdef void extend(const handle_t& handle, + index[uint64_t]* index, + const uint8_t* new_vectors, + const uint64_t* new_indices, + uint64_t n_rows) except + + + cdef void search(const handle_t& handle, + const search_params& params, + const index[uint64_t]& index, + const float* queries, + uint32_t n_queries, + uint32_t k, + uint64_t* neighbors, + float* distances, + device_memory_resource* mr) except + + + cdef void search(const handle_t& handle, + const search_params& params, + const index[uint64_t]& index, + const int8_t* queries, + uint32_t n_queries, + uint32_t k, + uint64_t* neighbors, + float* distances, + device_memory_resource* mr) except + + + cdef void search(const handle_t& handle, + const search_params& params, + const index[uint64_t]& index, + const uint8_t* queries, + uint32_t n_queries, + uint32_t k, + uint64_t* neighbors, + 
float* distances, + device_memory_resource* mr) except + diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx new file mode 100644 index 0000000000..75b7cd3abb --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx @@ -0,0 +1,727 @@ +# +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +import numpy as np + +from cython.operator cimport dereference as deref +from libc.stdint cimport ( + int8_t, + int64_t, + uint8_t, + uint32_t, + uint64_t, + uintptr_t, +) +from libcpp cimport bool, nullptr + +from pylibraft.distance.distance_type cimport DistanceType + +from pylibraft.common import Handle, cai_wrapper, device_ndarray +from pylibraft.common.interruptible import cuda_interruptible + +from pylibraft.common.handle cimport handle_t + +from pylibraft.common.handle import auto_sync_handle +from pylibraft.common.input_validation import is_c_contiguous + +from rmm._lib.memory_resource cimport ( + DeviceMemoryResource, + device_memory_resource, +) + +cimport pylibraft.neighbors.ivf_pq.c_ivf_pq as c_ivf_pq +from pylibraft.neighbors.ivf_pq.c_ivf_pq cimport index_params, search_params + + +def _get_metric(metric): + SUPPORTED_DISTANCES = { + "l2_expanded": DistanceType.L2Expanded, + # TODO(tfeher): fix inconsistency: index building 
for L2SqrtExpanded is + # only supported by build, not by search. + # "euclidean": DistanceType.L2SqrtExpanded + "inner_product": DistanceType.InnerProduct + } + if metric not in SUPPORTED_DISTANCES: + raise ValueError("metric %s is not supported" % metric) + return SUPPORTED_DISTANCES[metric] + + +cdef _get_metric_string(DistanceType metric): + return {DistanceType.L2Expanded : "l2_expanded", + DistanceType.InnerProduct: "inner_product"}[metric] + + +cdef _get_codebook_string(c_ivf_pq.codebook_gen codebook): + return {c_ivf_pq.codebook_gen.PER_SUBSPACE: "subspace", + c_ivf_pq.codebook_gen.PER_CLUSTER: "cluster"}[codebook] + + +cdef _map_dtype_np_to_cuda(dtype, supported_dtypes=None): + if supported_dtypes is not None and dtype not in supported_dtypes: + raise TypeError("Type %s is not supported" % str(dtype)) + return {np.float32: c_ivf_pq.cudaDataType_t.CUDA_R_32F, + np.float16: c_ivf_pq.cudaDataType_t.CUDA_R_16F, + np.uint8: c_ivf_pq.cudaDataType_t.CUDA_R_8U}[dtype] + + +cdef _get_dtype_string(dtype): + return str({c_ivf_pq.cudaDataType_t.CUDA_R_32F: np.float32, + c_ivf_pq.cudaDataType_t.CUDA_R_16F: np.float16, + c_ivf_pq.cudaDataType_t.CUDA_R_8U: np.uint8}[dtype]) + + +def _check_input_array(cai, exp_dt, exp_rows=None, exp_cols=None): + if cai.dtype not in exp_dt: + raise TypeError("dtype %s not supported" % cai["typestr"]) + + if not cai.c_contiguous: + raise ValueError("Row major input is expected") + + if exp_cols is not None and cai.shape[1] != exp_cols: + raise ValueError("Incorrect number of columns, expected {} got {}" + .format(exp_cols, cai.shape[1])) + + if exp_rows is not None and cai.shape[0] != exp_rows: + raise ValueError("Incorrect number of rows, expected {} , got {}" + .format(exp_rows, cai.shape[0])) + + +cdef class IndexParams: + cdef c_ivf_pq.index_params params + + def __init__(self, *, + n_lists=1024, + metric="l2_expanded", + kmeans_n_iters=20, + kmeans_trainset_fraction=0.5, + pq_bits=8, + pq_dim=0, + codebook_kind="subspace", + 
force_random_rotation=False, + add_data_on_build=True): + """" + Parameters to build index for IVF-PQ nearest neighbor search + + Parameters + ---------- + n_list : int, default = 1024 + The number of clusters used in the coarse quantizer. + metric : string denoting the metric type, default="l2_expanded" + Valid values for metric: ["l2_expanded", "inner_product"], where + - l2_expanded is the equclidean distance without the square root + operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2, + - inner product distance is defined as + distance(a, b) = \\sum_i a_i * b_i. + kmeans_n_iters : int, default = 20 + The number of iterations searching for kmeans centers during index + building. + kmeans_trainset_fraction : int, default = 0.5 + If kmeans_trainset_fraction is less than 1, then the dataset is + subsampled, and only n_samples * kmeans_trainset_fraction rows + are used for training. + pq_bits : int, default = 8 + The bit length of the vector element after quantization. + pq_dim : int, default = 0 + The dimensionality of a the vector after product quantization. + When zero, an optimal value is selected using a heuristic. Note + pq_dim * pq_bits must be a multiple of 8. Hint: a smaller 'pq_dim' + results in a smaller index size and better search performance, but + lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number, + but multiple of 8 are desirable for good performance. If 'pq_bits' + is not 8, 'pq_dim' should be a multiple of 8. For good performance, + it is desirable that 'pq_dim' is a multiple of 32. Ideally, + 'pq_dim' should be also a divisor of the dataset dim. + codebook_kind : string, default = "subspace" + Valid values ["subspace", "cluster"] + force_random_rotation : bool, default = False + Apply a random rotation matrix on the input data and queries even + if `dim % pq_dim == 0`. 
Note: if `dim` is not multiple of `pq_dim`, + a random rotation is always applied to the input data and queries + to transform the working space from `dim` to `rot_dim`, which may + be slightly larger than the original space and and is a multiple + of `pq_dim` (`rot_dim % pq_dim == 0`). However, this transform is + not necessary when `dim` is multiple of `pq_dim` (`dim == rot_dim`, + hence no need in adding "extra" data columns / features). By + default, if `dim == rot_dim`, the rotation transform is + initialized with the identity matrix. When + `force_random_rotation == True`, a random orthogonal transform + matrix is generated regardless of the values of `dim` and `pq_dim`. + add_data_on_build : bool, default = True + After training the coarse and fine quantizers, we will populate + the index with the dataset if add_data_on_build == True, otherwise + the index is left empty, and the extend method can be used + to add new vectors to the index. + + """ + self.params.n_lists = n_lists + self.params.metric = _get_metric(metric) + self.params.metric_arg = 0 + self.params.kmeans_n_iters = kmeans_n_iters + self.params.kmeans_trainset_fraction = kmeans_trainset_fraction + self.params.pq_bits = pq_bits + self.params.pq_dim = pq_dim + if codebook_kind == "subspace": + self.params.codebook_kind = c_ivf_pq.codebook_gen.PER_SUBSPACE + elif codebook_kind == "cluster": + self.params.codebook_kind = c_ivf_pq.codebook_gen.PER_CLUSTER + else: + raise ValueError("Incorrect codebook kind %s" % codebook_kind) + self.params.force_random_rotation = force_random_rotation + self.params.add_data_on_build = add_data_on_build + + @property + def n_lists(self): + return self.params.n_lists + + @property + def metric(self): + return self.params.metric + + @property + def kmeans_n_iters(self): + return self.params.kmeans_n_iters + + @property + def kmeans_trainset_fraction(self): + return self.params.kmeans_trainset_fraction + + @property + def pq_bits(self): + return self.params.pq_bits + + 
@property + def pq_dim(self): + return self.params.pq_dim + + @property + def codebook_kind(self): + return self.params.codebook_kind + + @property + def force_random_rotation(self): + return self.params.force_random_rotation + + @property + def add_data_on_build(self): + return self.params.add_data_on_build + + +cdef class Index: + # We store a pointer to the index because it dose not have a trivial + # constructor. + cdef c_ivf_pq.index[uint64_t] * index + cdef readonly bool trained + + def __cinit__(self, handle=None): + self.trained = False + self.index = NULL + if handle is None: + handle = Handle() + cdef handle_t* handle_ = handle.getHandle() + + # We create a placeholder object. The actual parameter values do + # not matter, it will be replaced with a built index object later. + self.index = new c_ivf_pq.index[uint64_t]( + deref(handle_), _get_metric("l2_expanded"), + c_ivf_pq.codebook_gen.PER_SUBSPACE, + 1, + 4, + 8, + 0, + 0) + + def __dealloc__(self): + if self.index is not NULL: + del self.index + + def __repr__(self): + m_str = "metric=" + _get_metric_string(self.index.metric()) + code_str = "codebook=" + _get_codebook_string( + self.index.codebook_kind()) + attr_str = [attr + "=" + str(getattr(self, attr)) + for attr in ["size", "dim", "pq_dim", "pq_bits", + "n_lists", "rot_dim"]] + attr_str = [m_str, code_str] + attr_str + return "Index(type=IVF-PQ, " + (", ".join(attr_str)) + ")" + + @property + def dim(self): + return self.index[0].dim() + + @property + def size(self): + return self.index[0].size() + + @property + def pq_dim(self): + return self.index[0].pq_dim() + + @property + def pq_len(self): + return self.index[0].pq_len() + + @property + def pq_bits(self): + return self.index[0].pq_bits() + + @property + def metric(self): + return self.index[0].metric() + + @property + def n_lists(self): + return self.index[0].n_lists() + + @property + def rot_dim(self): + return self.index[0].rot_dim() + + @property + def codebook_kind(self): + return 
self.index[0].codebook_kind() + + +@auto_sync_handle +def build(IndexParams index_params, dataset, handle=None): + """ + Builds an IVF-PQ index that can be later used for nearest neighbor search. + + Parameters + ---------- + index_params : IndexParams object + dataset : CUDA array interface compliant matrix shape (n_samples, dim) + Supported dtype [float, int8, uint8] + {handle_docstring} + + Returns + ------- + index: ivf_pq.Index + + Examples + -------- + + .. code-block:: python + + import cupy as cp + + from pylibraft.common import Handle + from pylibraft.neighbors import ivf_pq + + n_samples = 50000 + n_features = 50 + n_queries = 1000 + + dataset = cp.random.random_sample((n_samples, n_features), + dtype=cp.float32) + handle = Handle() + index_params = ivf_pq.IndexParams( + n_lists=1024, + metric="l2_expanded", + pq_dim=10) + index = ivf_pq.build(index_params, dataset, handle=handle) + + # Search using the built index + queries = cp.random.random_sample((n_queries, n_features), + dtype=cp.float32) + k = 10 + distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), index, + queries, k, handle=handle) + + distances = cp.asarray(distances) + neighbors = cp.asarray(neighbors) + + # pylibraft functions are often asynchronous so the + # handle needs to be explicitly synchronized + handle.sync() + + """ + dataset_cai = cai_wrapper(dataset) + dataset_dt = dataset_cai.dtype + _check_input_array(dataset_cai, [np.dtype('float32'), np.dtype('byte'), + np.dtype('ubyte')]) + cdef uintptr_t dataset_ptr = dataset_cai.data + + cdef uint64_t n_rows = dataset_cai.shape[0] + cdef uint32_t dim = dataset_cai.shape[1] + + if handle is None: + handle = Handle() + cdef handle_t* handle_ = handle.getHandle() + + idx = Index() + + if dataset_dt == np.float32: + with cuda_interruptible(): + c_ivf_pq.build(deref(handle_), + index_params.params, + dataset_ptr, + n_rows, + dim, + idx.index) + idx.trained = True + elif dataset_dt == np.byte: + with cuda_interruptible(): + 
c_ivf_pq.build(deref(handle_), + index_params.params, + dataset_ptr, + n_rows, + dim, + idx.index) + idx.trained = True + elif dataset_dt == np.ubyte: + with cuda_interruptible(): + c_ivf_pq.build(deref(handle_), + index_params.params, + dataset_ptr, + n_rows, + dim, + idx.index) + idx.trained = True + else: + raise TypeError("dtype %s not supported" % dataset_dt) + + return idx + + +@auto_sync_handle +def extend(Index index, new_vectors, new_indices, handle=None): + """ + Extend an existing index with new vectors. + + + Parameters + ---------- + index : ivf_pq.Index + Trained ivf_pq object. + new_vectors : CUDA array interface compliant matrix shape (n_samples, dim) + Supported dtype [float, int8, uint8] + new_indices : CUDA array interface compliant matrix shape (n_samples, dim) + Supported dtype [uint64] + {handle_docstring} + + Returns + ------- + index: ivf_pq.Index + + Examples + -------- + + .. code-block:: python + + import cupy as cp + + from pylibraft.common import Handle + from pylibraft.neighbors import ivf_pq + + n_samples = 50000 + n_features = 50 + n_queries = 1000 + + dataset = cp.random.random_sample((n_samples, n_features), + dtype=cp.float32) + handle = Handle() + index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle) + + n_rows = 100 + more_data = cp.random.random_sample((n_rows, n_features), + dtype=cp.float32) + indices = index.size + cp.arange(n_rows, dtype=cp.uint64) + index = ivf_pq.extend(index, more_data, indices) + + # Search using the built index + queries = cp.random.random_sample((n_queries, n_features), + dtype=cp.float32) + k = 10 + distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), + index, queries, + k, handle=handle) + + # pylibraft functions are often asynchronous so the + # handle needs to be explicitly synchronized + handle.sync() + + distances = cp.asarray(distances) + neighbors = cp.asarray(neighbors) + """ + if not index.trained: + raise ValueError("Index need to be built before calling extend.") + + if 
handle is None: + handle = Handle() + cdef handle_t* handle_ = handle.getHandle() + + vecs_cai = cai_wrapper(new_vectors) + vecs_dt = vecs_cai.dtype + cdef uint64_t n_rows = vecs_cai.shape[0] + cdef uint32_t dim = vecs_cai.shape[1] + + _check_input_array(vecs_cai, [np.dtype('float32'), np.dtype('byte'), + np.dtype('ubyte')], + exp_cols=index.dim) + + idx_cai = cai_wrapper(new_indices) + _check_input_array(idx_cai, [np.dtype('uint64')], exp_rows=n_rows) + if len(idx_cai.shape)!=1: + raise ValueError("Indices array is expected to be 1D") + + cdef uintptr_t vecs_ptr = vecs_cai.data + cdef uintptr_t idx_ptr = idx_cai.data + + if vecs_dt == np.float32: + with cuda_interruptible(): + c_ivf_pq.extend(deref(handle_), + index.index, + vecs_ptr, + idx_ptr, + n_rows) + elif vecs_dt == np.int8: + with cuda_interruptible(): + c_ivf_pq.extend(deref(handle_), + index.index, + vecs_ptr, + idx_ptr, + n_rows) + elif vecs_dt == np.uint8: + with cuda_interruptible(): + c_ivf_pq.extend(deref(handle_), + index.index, + vecs_ptr, + idx_ptr, + n_rows) + else: + raise TypeError("query dtype %s not supported" % vecs_dt) + + return index + + +cdef class SearchParams: + cdef c_ivf_pq.search_params params + + def __init__(self, *, n_probes=20, + lut_dtype=np.float32, + internal_distance_dtype=np.float32): + """ + IVF-PQ search parameters + + Parameters + ---------- + n_probes: int, default = 1024 + The number of course clusters to select for the fine search. + lut_dtype: default = np.float32 + Data type of look up table to be created dynamically at search + time. The use of low-precision types reduces the amount of shared + memory required at search time, so fast shared memory kernels can + be used even for datasets with large dimansionality. Note that + the recall is slightly degraded when low-precision type is + selected. Possible values [np.float32, np.float16, np.uint8] + internal_distance_dtype: default = np.float32 + Storage data type for distance/similarity computation. 
+ Possible values [np.float32, np.float16] + """ + + self.params.n_probes = n_probes + self.params.lut_dtype = _map_dtype_np_to_cuda(lut_dtype) + self.params.internal_distance_dtype = \ + _map_dtype_np_to_cuda(internal_distance_dtype) + # TODO(tfeher): enable if #926 adds this + # self.params.shmem_carveout = self.shmem_carveout + + def __repr__(self): + lut_str = "lut_dtype=" + _get_dtype_string(self.params.lut_dtype) + idt_str = "internal_distance_dtype=" + \ + _get_dtype_string(self.params.internal_distance_dtype) + attr_str = [attr + "=" + str(getattr(self, attr)) + for attr in ["n_probes"]] + # TODO (tfeher) add "shmem_carveout" + attr_str = attr_str + [lut_str, idt_str] + return "SearchParams(type=IVF-PQ, " + (", ".join(attr_str)) + ")" + + @property + def n_probes(self): + return self.params.n_probes + + @property + def lut_dtype(self): + return self.params.lut_dtype + + @property + def internal_distance_dtype(self): + return self.params.internal_distance_dtype + + +@auto_sync_handle +def search(SearchParams search_params, + Index index, + queries, + k, + neighbors=None, + distances=None, + DeviceMemoryResource memory_resource=None, + handle=None): + """ + Find the k nearest neighbors for each query. + + Parameters + ---------- + search_params : SearchParams + index : Index + Trained IVF-PQ index. + queries : CUDA array interface compliant matrix shape (n_samples, dim) + Supported dtype [float, int8, uint8] + k : int + The number of neighbors. + neighbors : Optional CUDA array interface compliant matrix shape + (n_queries, k), dtype uint64_t. If supplied, neighbor + indices will be written here in-place. (default None) + distances : Optional CUDA array interface compliant matrix shape + (n_queries, k) If supplied, the distances to the + neighbors will be written here in-place. (default None) + memory_resource : RMM DeviceMemoryResource object, optional + This can be used to explicitly manage the temporary memory + allocation during search. 
Passing a pooling allocator can reduce + memory allocation overhead. If not specified, then the memory + resource from the raft handle is used. + {handle_docstring} + + Examples + -------- + .. code-block:: python + + import cupy as cp + + from pylibraft.common import Handle + from pylibraft.neighbors import ivf_pq + + n_samples = 50000 + n_features = 50 + n_queries = 1000 + dataset = cp.random.random_sample((n_samples, n_features), + dtype=cp.float32) + + # Build index + handle = Handle() + index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle) + + # Search using the built index + queries = cp.random.random_sample((n_queries, n_features), + dtype=cp.float32) + k = 10 + search_params = ivf_pq.SearchParams( + n_probes=20, + lut_dtype=ivf_pq.np.float16, + internal_distance_dtype=ivf_pq.np.float32 + ) + + # Using a pooling allocator reduces overhead of temporary array + # creation during search. This is useful if multiple searches + # are performad with same query size. + mr = rmm.mr.PoolMemoryResource( + rmm.mr.CudaMemoryResource(), + initial_pool_size=2**29, + maximum_pool_size=2**31 + ) + distances, neighbors = ivf_pq.search(search_params, index, queries, + k, memory_resource=mr, + handle=handle) + + # pylibraft functions are often asynchronous so the + # handle needs to be explicitly synchronized + handle.sync() + + neighbors = cp.asarray(neighbors) + distances = cp.asarray(distances) + """ + + if not index.trained: + raise ValueError("Index need to be built before calling search.") + + if handle is None: + handle = Handle() + cdef handle_t* handle_ = handle.getHandle() + + queries_cai = cai_wrapper(queries) + queries_dt = queries_cai.dtype + cdef uint32_t n_queries = queries_cai.shape[0] + + _check_input_array(queries_cai, [np.dtype('float32'), np.dtype('byte'), + np.dtype('ubyte')], + exp_cols=index.dim) + + if neighbors is None: + neighbors = device_ndarray.empty((n_queries, k), dtype='uint64') + + neighbors_cai = cai_wrapper(neighbors) + 
_check_input_array(neighbors_cai, [np.dtype('uint64')], + exp_rows=n_queries, exp_cols=k) + + if distances is None: + distances = device_ndarray.empty((n_queries, k), dtype='float32') + + distances_cai = cai_wrapper(distances) + _check_input_array(distances_cai, [np.dtype('float32')], + exp_rows=n_queries, exp_cols=k) + + cdef c_ivf_pq.search_params params = search_params.params + + cdef uintptr_t queries_ptr = queries_cai.data + cdef uintptr_t neighbors_ptr = neighbors_cai.data + cdef uintptr_t distances_ptr = distances_cai.data + # TODO(tfeher) pass mr_ptr arg + cdef device_memory_resource* mr_ptr = nullptr + if memory_resource is not None: + mr_ptr = memory_resource.get_mr() + + if queries_dt == np.float32: + with cuda_interruptible(): + c_ivf_pq.search(deref(handle_), + params, + deref(index.index), + queries_ptr, + n_queries, + k, + neighbors_ptr, + distances_ptr, + mr_ptr) + elif queries_dt == np.byte: + with cuda_interruptible(): + c_ivf_pq.search(deref(handle_), + params, + deref(index.index), + queries_ptr, + n_queries, + k, + neighbors_ptr, + distances_ptr, + mr_ptr) + elif queries_dt == np.ubyte: + with cuda_interruptible(): + c_ivf_pq.search(deref(handle_), + params, + deref(index.index), + queries_ptr, + n_queries, + k, + neighbors_ptr, + distances_ptr, + mr_ptr) + else: + raise ValueError("query dtype %s not supported" % queries_dt) + + return (distances, neighbors) diff --git a/python/pylibraft/pylibraft/random/CMakeLists.txt b/python/pylibraft/pylibraft/random/CMakeLists.txt index c44b0cea67..49ca8627cc 100644 --- a/python/pylibraft/pylibraft/random/CMakeLists.txt +++ b/python/pylibraft/pylibraft/random/CMakeLists.txt @@ -14,14 +14,14 @@ # Set the list of Cython files to build set(cython_sources rmat_rectangular_generator.pyx) -# TODO: should finally be replaced with 'compiled' library to be more generic, -# when that is available + +# TODO: should finally be replaced with 'compiled' library to be more generic, when that is +# available 
set(linked_libraries raft::raft raft::distance) # Build all of the Cython targets rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" - ASSOCIATED_TARGETS raft - MODULE_PREFIX random_) + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX random_ +) diff --git a/python/pylibraft/pylibraft/random/rmat_rectangular_generator.pyx b/python/pylibraft/pylibraft/random/rmat_rectangular_generator.pyx index 56359ff8a3..ef785a900b 100644 --- a/python/pylibraft/pylibraft/random/rmat_rectangular_generator.pyx +++ b/python/pylibraft/pylibraft/random/rmat_rectangular_generator.pyx @@ -20,15 +20,18 @@ import numpy as np -from libc.stdint cimport uintptr_t, int64_t from cython.operator cimport dereference as deref -from pylibraft.common import Handle +from libc.stdint cimport int64_t, uintptr_t + +from pylibraft.common import Handle, cai_wrapper from pylibraft.common.handle import auto_sync_handle -from pylibraft.common.handle cimport handle_t -from .rng_state cimport RngState from libcpp cimport bool +from pylibraft.common.handle cimport handle_t + +from .rng_state cimport RngState + cdef extern from "raft_distance/random/rmat_rectangular_generator.hpp" \ namespace "raft::random::runtime": @@ -126,14 +129,14 @@ def rmat(out, theta, r_scale, c_scale, seed=12345, handle=None): if out is None: raise Exception("'out' cannot be None!") - out_cai = out.__cuda_array_interface__ - theta_cai = theta.__cuda_array_interface__ + out_cai = cai_wrapper(out) + theta_cai = cai_wrapper(theta) - n_edges = out_cai["shape"][0] - out_ptr = out_cai["data"][0] - theta_ptr = theta_cai["data"][0] - out_dt = np.dtype(out_cai["typestr"]) - theta_dt = np.dtype(theta_cai["typestr"]) + n_edges = out_cai.shape[0] + out_ptr = out_cai.data + theta_ptr = theta_cai.data + out_dt = out_cai.dtype + theta_dt = theta_cai.dtype cdef RngState *rng = new RngState(seed) diff --git 
a/python/pylibraft/pylibraft/random/rng_state.pxd b/python/pylibraft/pylibraft/random/rng_state.pxd index 9d1e2d17e3..8c89db3618 100644 --- a/python/pylibraft/pylibraft/random/rng_state.pxd +++ b/python/pylibraft/pylibraft/random/rng_state.pxd @@ -20,6 +20,7 @@ from libc.stdint cimport uint64_t + cdef extern from "raft/random/rng_state.hpp" namespace "raft::random" nogil: ctypedef enum GeneratorType: diff --git a/python/pylibraft/pylibraft/test/test_cai_wrapper.py b/python/pylibraft/pylibraft/test/test_cai_wrapper.py new file mode 100644 index 0000000000..e0c89b0291 --- /dev/null +++ b/python/pylibraft/pylibraft/test/test_cai_wrapper.py @@ -0,0 +1,40 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import numpy as np +import pytest + +from pylibraft.common import cai_wrapper, device_ndarray + + +@pytest.mark.parametrize("order", ["F", "C"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("shape", [(10, 5)]) +def test_basic_accessors(order, dtype, shape): + + a = np.random.random(shape).astype(dtype) + + if order == "C": + a = np.ascontiguousarray(a) + else: + a = np.asfortranarray(a) + + db = device_ndarray(a) + cai_wrap = cai_wrapper(db) + + assert cai_wrap.dtype == dtype + assert cai_wrap.shape == shape + assert cai_wrap.c_contiguous == (order == "C") + assert cai_wrap.f_contiguous == (order == "F") diff --git a/python/pylibraft/pylibraft/test/test_device_ndarray.py b/python/pylibraft/pylibraft/test/test_device_ndarray.py new file mode 100644 index 0000000000..ee96abe049 --- /dev/null +++ b/python/pylibraft/pylibraft/test/test_device_ndarray.py @@ -0,0 +1,63 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import numpy as np +import pytest + +from pylibraft.common import device_ndarray + + +@pytest.mark.parametrize("order", ["F", "C"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_basic_attributes(order, dtype): + + a = np.random.random((500, 2)).astype(dtype) + + if order == "C": + a = np.ascontiguousarray(a) + else: + a = np.asfortranarray(a) + + db = device_ndarray(a) + db_host = db.copy_to_host() + + assert a.shape == db.shape + assert a.dtype == db.dtype + assert a.data.f_contiguous == db.f_contiguous + assert a.data.f_contiguous == db_host.data.f_contiguous + assert a.data.c_contiguous == db.c_contiguous + assert a.data.c_contiguous == db_host.data.c_contiguous + np.testing.assert_array_equal(a.tolist(), db_host.tolist()) + + +@pytest.mark.parametrize("order", ["F", "C"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_empty(order, dtype): + + a = np.random.random((500, 2)).astype(dtype) + if order == "C": + a = np.ascontiguousarray(a) + else: + a = np.asfortranarray(a) + + db = device_ndarray.empty(a.shape, dtype=dtype, order=order) + db_host = db.copy_to_host() + + assert a.shape == db.shape + assert a.dtype == db.dtype + assert a.data.f_contiguous == db.f_contiguous + assert a.data.f_contiguous == db_host.data.f_contiguous + assert a.data.c_contiguous == db.c_contiguous + assert a.data.c_contiguous == db_host.data.c_contiguous diff --git a/python/pylibraft/pylibraft/test/test_distance.py b/python/pylibraft/pylibraft/test/test_distance.py index 7f35a25493..a08656d3aa 100644 --- a/python/pylibraft/pylibraft/test/test_distance.py +++ b/python/pylibraft/pylibraft/test/test_distance.py @@ -13,25 +13,35 @@ # limitations under the License. 
# -from scipy.spatial.distance import cdist -import pytest import numpy as np +import pytest +from scipy.spatial.distance import cdist -from pylibraft.common import Handle +from pylibraft.common import Handle, device_ndarray from pylibraft.distance import pairwise_distance -from pylibraft.testing.utils import TestDeviceBuffer - @pytest.mark.parametrize("n_rows", [100]) @pytest.mark.parametrize("n_cols", [100]) -@pytest.mark.parametrize("metric", ["euclidean", "cityblock", "chebyshev", - "canberra", "correlation", "hamming", - "jensenshannon", "russellrao", "cosine", - "sqeuclidean"]) +@pytest.mark.parametrize( + "metric", + [ + "euclidean", + "cityblock", + "chebyshev", + "canberra", + "correlation", + "hamming", + "jensenshannon", + "russellrao", + "cosine", + "sqeuclidean", + ], +) +@pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("order", ["F", "C"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_distance(n_rows, n_cols, metric, order, dtype): +def test_distance(n_rows, n_cols, inplace, metric, order, dtype): input1 = np.random.random_sample((n_rows, n_cols)) input1 = np.asarray(input1, order=order).astype(dtype) @@ -51,13 +61,17 @@ def test_distance(n_rows, n_cols, metric, order, dtype): expected[expected <= 1e-5] = 0.0 - input1_device = TestDeviceBuffer(input1, order) - output_device = TestDeviceBuffer(output, order) + input1_device = device_ndarray(input1) + output_device = device_ndarray(output) if inplace else None handle = Handle() - pairwise_distance(input1_device, input1_device, output_device, metric) + ret_output = pairwise_distance( + input1_device, input1_device, output_device, metric + ) handle.sync() + output_device = ret_output if not inplace else output_device + actual = output_device.copy_to_host() actual[actual <= 1e-5] = 0.0 diff --git a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py index 1ce1ee2d1f..b05ad3d530 100644 --- 
a/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py +++ b/python/pylibraft/pylibraft/test/test_fused_l2_argmin.py @@ -13,20 +13,20 @@ # limitations under the License. # -from scipy.spatial.distance import cdist -import pytest import numpy as np +import pytest +from scipy.spatial.distance import cdist -from pylibraft.common import Handle +from pylibraft.common import Handle, device_ndarray from pylibraft.distance import fused_l2_nn_argmin -from pylibraft.testing.utils import TestDeviceBuffer +@pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("n_rows", [10, 100]) @pytest.mark.parametrize("n_clusters", [5, 10]) @pytest.mark.parametrize("n_cols", [3, 5]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype): +def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype, inplace): input1 = np.random.random_sample((n_rows, n_cols)) input1 = np.asarray(input1, order="C").astype(dtype) @@ -38,14 +38,16 @@ def test_fused_l2_nn_minarg(n_rows, n_cols, n_clusters, dtype): expected = expected.argmin(axis=1) - input1_device = TestDeviceBuffer(input1, "C") - input2_device = TestDeviceBuffer(input2, "C") - output_device = TestDeviceBuffer(output, "C") + input1_device = device_ndarray(input1) + input2_device = device_ndarray(input2) + output_device = device_ndarray(output) if inplace else None handle = Handle() - fused_l2_nn_argmin(input1_device, input2_device, output_device, - True, handle=handle) + ret_output = fused_l2_nn_argmin( + input1_device, input2_device, output_device, True, handle=handle + ) handle.sync() + output_device = ret_output if not inplace else output_device actual = output_device.copy_to_host() assert np.allclose(expected, actual, rtol=1e-4) diff --git a/python/pylibraft/pylibraft/test/test_ivf_pq.py b/python/pylibraft/pylibraft/test/test_ivf_pq.py new file mode 100644 index 0000000000..78a02ad77e --- /dev/null +++ 
b/python/pylibraft/pylibraft/test/test_ivf_pq.py @@ -0,0 +1,482 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# h ttp://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +import pytest +from sklearn.metrics import pairwise_distances +from sklearn.neighbors import NearestNeighbors +from sklearn.preprocessing import normalize + +from pylibraft.common import device_ndarray +from pylibraft.neighbors import ivf_pq + + +def generate_data(shape, dtype): + if dtype == np.byte: + x = np.random.randint(-127, 128, size=shape, dtype=np.byte) + elif dtype == np.ubyte: + x = np.random.randint(0, 255, size=shape, dtype=np.ubyte) + else: + x = np.random.random_sample(shape).astype(dtype) + + return x + + +def calc_recall(ann_idx, true_nn_idx): + assert ann_idx.shape == true_nn_idx.shape + n = 0 + for i in range(ann_idx.shape[0]): + n += np.intersect1d(ann_idx[i, :], true_nn_idx[i, :]).size + recall = n / ann_idx.size + return recall + + +def check_distances(dataset, queries, metric, out_idx, out_dist): + """ + Calculate the real distance between queries and dataset[out_idx], + and compare it to out_dist. 
+ """ + dist = np.empty(out_dist.shape, out_dist.dtype) + for i in range(queries.shape[0]): + X = queries[np.newaxis, i, :] + Y = dataset[out_idx[i, :], :] + if metric == "l2_expanded": + dist[i, :] = pairwise_distances(X, Y, "euclidean") + elif metric == "inner_product": + dist[i, :] = np.matmul(X, Y.T) + else: + raise ValueError("Invali metric") + + # Note: raft l2 metric does not include the square root operation like + # sklearn's euclidean. + if metric == "l2_expanded": + dist = np.power(dist, 2) + + dist_eps = abs(dist) + dist_eps[dist < 1e-3] = 1e-3 + diff = abs(out_dist - dist) / dist_eps + + # Quantization leads to errors in the distance calculation. + # The aim of this test is not to test precision, but to catch obvious + # errors. + assert np.mean(diff) < 0.1 + + +def run_ivf_pq_build_search_test( + n_rows, + n_cols, + n_queries, + k, + n_lists, + metric, + dtype, + pq_bits=8, + pq_dim=0, + codebook_kind="subspace", + add_data_on_build="True", + n_probes=100, + lut_dtype=np.float32, + internal_distance_dtype=np.float32, + force_random_rotation=False, + kmeans_trainset_fraction=1, + kmeans_n_iters=20, + compare=True, + inplace=True, +): + dataset = generate_data((n_rows, n_cols), dtype) + if metric == "inner_product": + dataset = normalize(dataset, norm="l2", axis=1) + dataset_device = device_ndarray(dataset) + + build_params = ivf_pq.IndexParams( + n_lists=n_lists, + metric=metric, + kmeans_n_iters=kmeans_n_iters, + kmeans_trainset_fraction=kmeans_trainset_fraction, + pq_bits=pq_bits, + pq_dim=pq_dim, + codebook_kind=codebook_kind, + force_random_rotation=force_random_rotation, + add_data_on_build=add_data_on_build, + ) + + index = ivf_pq.build(build_params, dataset_device) + + assert index.trained + if pq_dim != 0: + assert index.pq_dim == build_params.pq_dim + assert index.pq_bits == build_params.pq_bits + assert index.metric == build_params.metric + assert index.n_lists == build_params.n_lists + + if not add_data_on_build: + dataset_1_device = 
device_ndarray(dataset[: n_rows // 2, :]) + dataset_2_device = device_ndarray(dataset[n_rows // 2 :, :]) + indices_1 = np.arange(n_rows // 2, dtype=np.uint64) + indices_1_device = device_ndarray(indices_1) + indices_2 = np.arange(n_rows // 2, n_rows, dtype=np.uint64) + indices_2_device = device_ndarray(indices_2) + index = ivf_pq.extend(index, dataset_1_device, indices_1_device) + index = ivf_pq.extend(index, dataset_2_device, indices_2_device) + + assert index.size >= n_rows + + queries = generate_data((n_queries, n_cols), dtype) + out_idx = np.zeros((n_queries, k), dtype=np.uint64) + out_dist = np.zeros((n_queries, k), dtype=np.float32) + + queries_device = device_ndarray(queries) + out_idx_device = device_ndarray(out_idx) if inplace else None + out_dist_device = device_ndarray(out_dist) if inplace else None + + search_params = ivf_pq.SearchParams( + n_probes=n_probes, + lut_dtype=lut_dtype, + internal_distance_dtype=internal_distance_dtype, + ) + + ret_output = ivf_pq.search( + search_params, + index, + queries_device, + k, + neighbors=out_idx_device, + distances=out_dist_device, + ) + + if not inplace: + out_dist_device, out_idx_device = ret_output + + if not compare: + return + + out_idx = out_idx_device.copy_to_host() + out_dist = out_dist_device.copy_to_host() + + # Calculate reference values with sklearn + skl_metric = {"l2_expanded": "euclidean", "inner_product": "cosine"}[ + metric + ] + nn_skl = NearestNeighbors( + n_neighbors=k, algorithm="brute", metric=skl_metric + ) + nn_skl.fit(dataset) + skl_idx = nn_skl.kneighbors(queries, return_distance=False) + + recall = calc_recall(out_idx, skl_idx) + assert recall > 0.7 + + check_distances(dataset, queries, metric, out_idx, out_dist) + + +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize("n_rows", [10000]) +@pytest.mark.parametrize("n_cols", [10]) +@pytest.mark.parametrize("n_queries", [100]) +@pytest.mark.parametrize("n_lists", [100]) +@pytest.mark.parametrize("dtype", 
[np.float32, np.int8, np.uint8]) +def test_ivf_pq_dtypes(n_rows, n_cols, n_queries, n_lists, dtype, inplace): + # Note that inner_product tests use normalized input which we cannot + # represent in int8, therefore we test only l2_expanded metric here. + run_ivf_pq_build_search_test( + n_rows=n_rows, + n_cols=n_cols, + n_queries=n_queries, + k=10, + n_lists=n_lists, + metric="l2_expanded", + dtype=dtype, + inplace=inplace, + ) + + +@pytest.mark.parametrize( + "params", + [ + pytest.param( + { + "n_rows": 0, + "n_cols": 10, + "n_queries": 10, + "k": 1, + "n_lists": 10, + }, + marks=pytest.mark.xfail(reason="empty dataset"), + ), + {"n_rows": 1, "n_cols": 10, "n_queries": 10, "k": 1, "n_lists": 10}, + {"n_rows": 10, "n_cols": 1, "n_queries": 10, "k": 10, "n_lists": 10}, + # {"n_rows": 999, "n_cols": 42, "n_queries": 453, "k": 137, + # "n_lists": 53}, + ], +) +def test_ivf_pq_n(params): + # We do not test recall, just confirm that we can handle edge cases for + # certain parameters + run_ivf_pq_build_search_test( + n_rows=params["n_rows"], + n_cols=params["n_cols"], + n_queries=params["n_queries"], + k=params["k"], + n_lists=params["n_lists"], + metric="l2_expanded", + dtype=np.float32, + compare=False, + ) + + +@pytest.mark.parametrize("metric", ["l2_expanded", "inner_product"]) +@pytest.mark.parametrize("dtype", [np.float32]) +@pytest.mark.parametrize("codebook_kind", ["subspace", "cluster"]) +@pytest.mark.parametrize("rotation", [True, False]) +def test_ivf_pq_build_params(metric, dtype, codebook_kind, rotation): + run_ivf_pq_build_search_test( + n_rows=10000, + n_cols=10, + n_queries=1000, + k=10, + n_lists=100, + metric=metric, + dtype=dtype, + pq_bits=8, + pq_dim=0, + codebook_kind=codebook_kind, + add_data_on_build=True, + n_probes=100, + force_random_rotation=rotation, + ) + + +@pytest.mark.parametrize( + "params", + [ + {"pq_dims": 10, "pq_bits": 8, "n_lists": 100}, + {"pq_dims": 16, "pq_bits": 7, "n_lists": 100}, + {"pq_dims": 0, "pq_bits": 8, "n_lists": 90}, 
+ { + "pq_dims": 0, + "pq_bits": 8, + "n_lists": 100, + "trainset_fraction": 0.9, + "n_iters": 30, + }, + ], +) +def test_ivf_pq_params(params): + run_ivf_pq_build_search_test( + n_rows=10000, + n_cols=16, + n_queries=1000, + k=10, + n_lists=params["n_lists"], + metric="l2_expanded", + dtype=np.float32, + pq_bits=params["pq_bits"], + pq_dim=params["pq_dims"], + kmeans_trainset_fraction=params.get("trainset_fraction", 1.0), + kmeans_n_iters=params.get("n_iters", 20), + ) + + +@pytest.mark.parametrize( + "params", + [ + { + "k": 10, + "n_probes": 100, + "lut": np.float16, + "idd": np.float32, + }, + { + "k": 10, + "n_probes": 99, + "lut": np.uint8, + "idd": np.float32, + }, + { + "k": 10, + "n_probes": 100, + "lut": np.float32, + "idd": np.float16, + }, + { + "k": 129, + "n_probes": 100, + "lut": np.float32, + "idd": np.float32, + }, + ], +) +def test_ivf_pq_search_params(params): + run_ivf_pq_build_search_test( + n_rows=10000, + n_cols=16, + n_queries=1000, + k=params["k"], + n_lists=100, + n_probes=params["n_probes"], + metric="l2_expanded", + dtype=np.float32, + lut_dtype=params["lut"], + internal_distance_dtype=params["idd"], + ) + + +@pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8]) +def test_extend(dtype): + run_ivf_pq_build_search_test( + n_rows=10000, + n_cols=10, + n_queries=100, + k=10, + n_lists=100, + metric="l2_expanded", + dtype=dtype, + add_data_on_build=False, + ) + + +def test_build_assertions(): + with pytest.raises(TypeError): + run_ivf_pq_build_search_test( + n_rows=1000, + n_cols=10, + n_queries=100, + k=10, + n_lists=100, + metric="l2_expanded", + dtype=np.float64, + ) + + n_rows = 1000 + n_cols = 100 + n_queries = 212 + k = 10 + dataset = generate_data((n_rows, n_cols), np.float32) + dataset_device = device_ndarray(dataset) + + index_params = ivf_pq.IndexParams( + n_lists=50, + metric="l2_expanded", + kmeans_n_iters=20, + kmeans_trainset_fraction=1, + add_data_on_build=False, + ) + + index = ivf_pq.Index() + + queries = 
generate_data((n_queries, n_cols), np.float32) + out_idx = np.zeros((n_queries, k), dtype=np.uint64) + out_dist = np.zeros((n_queries, k), dtype=np.float32) + + queries_device = device_ndarray(queries) + out_idx_device = device_ndarray(out_idx) + out_dist_device = device_ndarray(out_dist) + + search_params = ivf_pq.SearchParams(n_probes=50) + + with pytest.raises(ValueError): + # Index must be built before search + ivf_pq.search( + search_params, + index, + queries_device, + k, + out_idx_device, + out_dist_device, + ) + + index = ivf_pq.build(index_params, dataset_device) + assert index.trained + + indices = np.arange(n_rows + 1, dtype=np.uint64) + indices_device = device_ndarray(indices) + + with pytest.raises(ValueError): + # Dataset dimension mismatch + ivf_pq.extend(index, queries_device, indices_device) + + with pytest.raises(ValueError): + # indices dimension mismatch + ivf_pq.extend(index, dataset_device, indices_device) + + +@pytest.mark.parametrize( + "params", + [ + {"q_dt": np.float64}, + {"q_order": "F"}, + {"q_cols": 101}, + {"idx_dt": np.uint32}, + {"idx_order": "F"}, + {"idx_rows": 42}, + {"idx_cols": 137}, + {"dist_dt": np.float64}, + {"dist_order": "F"}, + {"dist_rows": 42}, + {"dist_cols": 137}, + ], +) +def test_search_inputs(params): + """Test with invalid input dtype, order, or dimension.""" + n_rows = 1000 + n_cols = 100 + n_queries = 256 + k = 10 + dtype = np.float32 + + q_dt = params.get("q_dt", np.float32) + q_order = params.get("q_order", "C") + queries = generate_data( + (n_queries, params.get("q_cols", n_cols)), q_dt + ).astype(q_dt, order=q_order) + queries_device = device_ndarray(queries) + + idx_dt = params.get("idx_dt", np.uint64) + idx_order = params.get("idx_order", "C") + out_idx = np.zeros( + (params.get("idx_rows", n_queries), params.get("idx_cols", k)), + dtype=idx_dt, + order=idx_order, + ) + out_idx_device = device_ndarray(out_idx) + + dist_dt = params.get("dist_dt", np.float32) + dist_order = params.get("dist_order", "C") + 
out_dist = np.zeros( + (params.get("dist_rows", n_queries), params.get("dist_cols", k)), + dtype=dist_dt, + order=dist_order, + ) + out_dist_device = device_ndarray(out_dist) + + index_params = ivf_pq.IndexParams( + n_lists=50, metric="l2_expanded", add_data_on_build=True + ) + + dataset = generate_data((n_rows, n_cols), dtype) + dataset_device = device_ndarray(dataset) + index = ivf_pq.build(index_params, dataset_device) + assert index.trained + + with pytest.raises(Exception): + search_params = ivf_pq.SearchParams(n_probes=50) + ivf_pq.search( + search_params, + index, + queries_device, + k, + out_idx_device, + out_dist_device, + ) diff --git a/python/pylibraft/pylibraft/test/test_kmeans.py b/python/pylibraft/pylibraft/test/test_kmeans.py index 1d65470a82..44f60be310 100644 --- a/python/pylibraft/pylibraft/test/test_kmeans.py +++ b/python/pylibraft/pylibraft/test/test_kmeans.py @@ -13,15 +13,13 @@ # limitations under the License. # -import pytest import numpy as np +import pytest -from pylibraft.common import Handle -from pylibraft.cluster.kmeans import compute_new_centroids +from pylibraft.cluster.kmeans import cluster_cost, compute_new_centroids +from pylibraft.common import Handle, device_ndarray from pylibraft.distance import pairwise_distance -from pylibraft.testing.utils import TestDeviceBuffer - @pytest.mark.parametrize("n_rows", [100]) @pytest.mark.parametrize("n_cols", [5, 25]) @@ -29,40 +27,41 @@ @pytest.mark.parametrize("metric", ["euclidean", "sqeuclidean"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("additional_args", [True, False]) -def test_compute_new_centroids(n_rows, n_cols, metric, n_clusters, dtype, - additional_args): - - order = "C" +def test_compute_new_centroids( + n_rows, n_cols, metric, n_clusters, dtype, additional_args +): # A single RAFT handle can optionally be reused across # pylibraft functions. 
handle = Handle() X = np.random.random_sample((n_rows, n_cols)).astype(dtype) - X_device = TestDeviceBuffer(X, order) + X_device = device_ndarray(X) centroids = X[:n_clusters] - centroids_device = TestDeviceBuffer(centroids, order) + centroids_device = device_ndarray(centroids) - weight_per_cluster = np.zeros((n_clusters, ), dtype=dtype) - weight_per_cluster_device = TestDeviceBuffer(weight_per_cluster, order) \ - if additional_args else None + weight_per_cluster = np.zeros((n_clusters,), dtype=dtype) + weight_per_cluster_device = ( + device_ndarray(weight_per_cluster) if additional_args else None + ) new_centroids = np.zeros((n_clusters, n_cols), dtype=dtype) - new_centroids_device = TestDeviceBuffer(new_centroids, order) + new_centroids_device = device_ndarray(new_centroids) sample_weights = np.ones((n_rows,)).astype(dtype) / n_rows - sample_weights_device = TestDeviceBuffer(sample_weights, order) \ - if additional_args else None + sample_weights_device = ( + device_ndarray(sample_weights) if additional_args else None + ) # Compute new centroids naively dists = np.zeros((n_rows, n_clusters), dtype=dtype) - dists_device = TestDeviceBuffer(dists, order) + dists_device = device_ndarray(dists) pairwise_distance(X_device, centroids_device, dists_device, metric=metric) handle.sync() labels = np.argmin(dists_device.copy_to_host(), axis=1).astype(np.int32) - labels_device = TestDeviceBuffer(labels, order) + labels_device = device_ndarray(labels) expected_centers = np.empty((n_clusters, n_cols), dtype=dtype) expected_wX = X * sample_weights.reshape((-1, 1)) @@ -72,13 +71,15 @@ def test_compute_new_centroids(n_rows, n_cols, metric, n_clusters, dtype, g = sample_weights[labels == i].sum() expected_centers[i, :] = j / g - compute_new_centroids(X_device, - centroids_device, - labels_device, - new_centroids_device, - sample_weights=sample_weights_device, - weight_per_cluster=weight_per_cluster_device, - handle=handle) + compute_new_centroids( + X_device, + centroids_device, + 
labels_device, + new_centroids_device, + sample_weights=sample_weights_device, + weight_per_cluster=weight_per_cluster_device, + handle=handle, + ) # pylibraft functions are often asynchronous so the # handle needs to be explicitly synchronized @@ -87,3 +88,31 @@ def test_compute_new_centroids(n_rows, n_cols, metric, n_clusters, dtype, actual_centers = new_centroids_device.copy_to_host() assert np.allclose(expected_centers, actual_centers, rtol=1e-6) + + +@pytest.mark.parametrize("n_rows", [100]) +@pytest.mark.parametrize("n_cols", [5, 25]) +@pytest.mark.parametrize("n_clusters", [4, 15]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_cluster_cost(n_rows, n_cols, n_clusters, dtype): + X = np.random.random_sample((n_rows, n_cols)).astype(dtype) + X_device = device_ndarray(X) + + centroids = X[:n_clusters] + centroids_device = device_ndarray(centroids) + + inertia = cluster_cost(X_device, centroids_device) + + # compute the nearest centroid to each sample + distances = pairwise_distance( + X_device, centroids_device, metric="sqeuclidean" + ).copy_to_host() + cluster_ids = np.argmin(distances, axis=1) + + cluster_distances = np.take_along_axis( + distances, cluster_ids[:, None], axis=1 + ) + + # need reduced tolerance for float32 + tol = 1e-3 if dtype == np.float32 else 1e-6 + assert np.allclose(inertia, sum(cluster_distances), rtol=tol, atol=tol) diff --git a/python/pylibraft/pylibraft/test/test_random.py b/python/pylibraft/pylibraft/test/test_random.py index e0b7140f1c..229baffff5 100644 --- a/python/pylibraft/pylibraft/test/test_random.py +++ b/python/pylibraft/pylibraft/test/test_random.py @@ -13,14 +13,12 @@ # limitations under the License. 
# -import pytest import numpy as np +import pytest -from pylibraft.common import Handle +from pylibraft.common import Handle, device_ndarray from pylibraft.random import rmat -from pylibraft.testing.utils import TestDeviceBuffer - def generate_theta(r_scale, c_scale): max_scale = max(r_scale, c_scale) @@ -35,7 +33,7 @@ def generate_theta(r_scale, c_scale): theta[4 * i + 1] = b / total theta[4 * i + 2] = c / total theta[4 * i + 3] = d / total - theta_device = TestDeviceBuffer(theta, "C") + theta_device = device_ndarray(theta) return theta, theta_device @@ -46,7 +44,7 @@ def generate_theta(r_scale, c_scale): def test_rmat(n_edges, r_scale, c_scale, dtype): theta, theta_device = generate_theta(r_scale, c_scale) out_buff = np.empty((n_edges, 2), dtype=dtype) - output_device = TestDeviceBuffer(out_buff, "C") + output_device = device_ndarray(out_buff) handle = Handle() rmat(output_device, theta_device, r_scale, c_scale, 12345, handle=handle) @@ -69,7 +67,7 @@ def test_rmat_exception(): dtype = np.int32 with pytest.raises(Exception) as exception: out_buff = np.empty((n_edges, 2), dtype=dtype) - output_device = TestDeviceBuffer(out_buff, "C") + output_device = device_ndarray(out_buff) rmat(output_device, None, r_scale, c_scale, 12345) assert exception is not None assert exception.message == "'theta' cannot be None!" 
@@ -85,7 +83,7 @@ def test_rmat_valueerror(): r_scale = c_scale = 16 with pytest.raises(ValueError) as exception: out_buff = np.empty((n_edges, 2), dtype=np.int16) - output_device = TestDeviceBuffer(out_buff, "C") + output_device = device_ndarray(out_buff) theta, theta_device = generate_theta(r_scale, c_scale) rmat(output_device, theta_device, r_scale, c_scale, 12345) assert exception is not None diff --git a/python/pylibraft/pylibraft/test/test_z_interruptible.py b/python/pylibraft/pylibraft/test/test_z_interruptible.py index bda98edd13..aac428b2d5 100644 --- a/python/pylibraft/pylibraft/test/test_z_interruptible.py +++ b/python/pylibraft/pylibraft/test/test_z_interruptible.py @@ -1,9 +1,11 @@ # Copyright (c) 2022, NVIDIA CORPORATION. import os -import pytest import signal import time + +import pytest + from pylibraft.common.interruptible import cuda_interruptible, cuda_yield @@ -14,14 +16,15 @@ def send_ctrl_c(): def test_should_cancel_via_interruptible(): start_time = time.monotonic() - with pytest.raises(RuntimeError, match='this thread was cancelled'): + with pytest.raises(RuntimeError, match="this thread was cancelled"): with cuda_interruptible(): send_ctrl_c() cuda_yield() time.sleep(1.0) end_time = time.monotonic() - assert end_time < start_time + 0.5, \ - "The process seems to have waited, while it shouldn't have." + assert ( + end_time < start_time + 0.5 + ), "The process seems to have waited, while it shouldn't have." def test_should_cancel_via_python(): @@ -31,8 +34,9 @@ def test_should_cancel_via_python(): cuda_yield() time.sleep(1.0) end_time = time.monotonic() - assert end_time < start_time + 0.5, \ - "The process seems to have waited, while it shouldn't have." + assert ( + end_time < start_time + 0.5 + ), "The process seems to have waited, while it shouldn't have." 
def test_should_wait_no_interrupt(): @@ -41,8 +45,9 @@ def test_should_wait_no_interrupt(): cuda_yield() time.sleep(1.0) end_time = time.monotonic() - assert end_time > start_time + 0.5, \ - "The process seems to be cancelled, while it shouldn't be." + assert ( + end_time > start_time + 0.5 + ), "The process seems to be cancelled, while it shouldn't be." def test_should_wait_no_yield(): @@ -51,5 +56,6 @@ def test_should_wait_no_yield(): send_ctrl_c() time.sleep(1.0) end_time = time.monotonic() - assert end_time > start_time + 0.5, \ - "The process seems to be cancelled, while it shouldn't be." + assert ( + end_time > start_time + 0.5 + ), "The process seems to be cancelled, while it shouldn't be." diff --git a/python/pylibraft/pylibraft/testing/utils.py b/python/pylibraft/pylibraft/testing/utils.py deleted file mode 100644 index 979fbb5672..0000000000 --- a/python/pylibraft/pylibraft/testing/utils.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import numpy as np - -import rmm - - -class TestDeviceBuffer: - - def __init__(self, ndarray, order): - - self.ndarray_ = ndarray - self.device_buffer_ = \ - rmm.DeviceBuffer.to_device(ndarray.ravel(order=order).tobytes()) - - @property - def __cuda_array_interface__(self): - device_cai = self.device_buffer_.__cuda_array_interface__ - host_cai = self.ndarray_.__array_interface__.copy() - host_cai["data"] = (device_cai["data"][0], device_cai["data"][1]) - - return host_cai - - def copy_to_host(self): - return np.frombuffer(self.device_buffer_.tobytes(), - dtype=self.ndarray_.dtype, - like=self.ndarray_)\ - .astype(self.ndarray_.dtype)\ - .reshape(self.ndarray_.shape) diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml index 03b3c7fa96..4711c2146e 100644 --- a/python/pylibraft/pyproject.toml +++ b/python/pylibraft/pyproject.toml @@ -18,7 +18,11 @@ requires = [ "wheel", "setuptools", "cython>=0.29,<0.30", + "cuda-python>=11.7.1,<12.0", "scikit-build>=0.13.1", - "cmake>=3.23.1", + "cmake>=3.23.1,!=3.25.0", + "versioneer", "ninja" ] +build-backend = "backend" +backend-path = ["_custom_build"] diff --git a/python/pylibraft/setup.cfg b/python/pylibraft/setup.cfg index e1f4865ac9..48a546cc30 100644 --- a/python/pylibraft/setup.cfg +++ b/python/pylibraft/setup.cfg @@ -1,11 +1,5 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -[flake8] -exclude = __init__.py,versioneer.py -# See the docstring in versioneer.py for instructions. Note that you must -# re-run 'versioneer.py setup' after changing this section, and commit the -# resulting files. 
- [versioneer] VCS = git style = pep440 @@ -34,6 +28,7 @@ known_rapids= rmm known_first_party= raft + pylibraft default_section=THIRDPARTY sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER skip= @@ -49,8 +44,3 @@ skip= build dist __init__.py - -[options] -packages = find: -install_requires = numpy -python_requires = >=3.7,<3.9 diff --git a/python/pylibraft/setup.py b/python/pylibraft/setup.py index 561c105d36..15889fcd71 100644 --- a/python/pylibraft/setup.py +++ b/python/pylibraft/setup.py @@ -14,40 +14,87 @@ # limitations under the License. # +import os + +import versioneer from setuptools import find_packages from skbuild import setup -import versioneer +cuda_suffix = os.getenv("RAPIDS_PY_WHEEL_CUDA_SUFFIX", default="") + +install_requires = [ + "numpy", + "cuda-python>=11.7.1,<12.0", + f"rmm{cuda_suffix}", +] + +extras_require = { + "test": [ + "pytest", + "scipy", + "scikit-learn", + ] +} + + +def exclude_libcxx_symlink(cmake_manifest): + return list( + filter( + lambda name: not ("include/rapids/libcxx/include" in name), + cmake_manifest, + ) + ) + + +# Make versioneer produce PyPI-compatible nightly versions for wheels. 
+if "RAPIDS_PY_WHEEL_VERSIONEER_OVERRIDE" in os.environ: + orig_get_versions = versioneer.get_versions + + version_override = os.environ["RAPIDS_PY_WHEEL_VERSIONEER_OVERRIDE"] + + def get_versions(): + data = orig_get_versions() + data["version"] = version_override + return data + + versioneer.get_versions = get_versions + -setup(name='pylibraft', - description="RAFT: Reusable Algorithms Functions and other Tools", - version=versioneer.get_version(), - classifiers=[ +setup( + name=f"pylibraft{cuda_suffix}", + description="RAFT: Reusable Algorithms Functions and other Tools", + version=versioneer.get_version(), + classifiers=[ "Intended Audience :: Developers", "Programming Language :: Python", "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9" - ], - author="NVIDIA Corporation", - package_data={ - # Note: A dict comprehension with an explicit copy is necessary - # (rather than something simpler like a dict.fromkeys) because - # otherwise every package will refer to the same list and skbuild - # modifies it in place. - key: ["*.hpp", "*.pxd"] - for key in find_packages( - include=[ - "pylibraft.distance", - "pylibraft.distance.includes", - "pylibraft.common", - "pylibraft.common.includes", - "pylibraft.random", - "pylibraft.random.includes" - ] - ) - }, - packages=find_packages(include=['pylibraft', 'pylibraft.*']), - license="Apache", - cmdclass=versioneer.get_cmdclass(), - zip_safe=False - ) + "Programming Language :: Python :: 3.9", + ], + author="NVIDIA Corporation", + include_package_data=True, + package_data={ + # Note: A dict comprehension with an explicit copy is necessary + # (rather than something simpler like a dict.fromkeys) because + # otherwise every package will refer to the same list and skbuild + # modifies it in place. 
+ key: ["*.hpp", "*.pxd"] + for key in find_packages( + include=[ + "pylibraft.distance", + "pylibraft.distance.includes", + "pylibraft.common", + "pylibraft.common.includes", + "pylibraft.random", + "pylibraft.random.includes", + ] + ) + }, + install_requires=install_requires, + extras_require=extras_require, + # Don't want libcxx getting pulled into wheel builds. + cmake_process_manifest_hook=exclude_libcxx_symlink, + packages=find_packages(include=["pylibraft", "pylibraft.*"]), + license="Apache 2.0", + cmdclass=versioneer.get_cmdclass(), + zip_safe=False, +) diff --git a/python/pylibraft/setuputils.py b/python/pylibraft/setuputils.py index d93e4b06a4..0a3f421856 100755 --- a/python/pylibraft/setuputils.py +++ b/python/pylibraft/setuputils.py @@ -51,15 +51,15 @@ def clean_folder(path): path : String Path to the folder to be cleaned. """ - shutil.rmtree(path + '/__pycache__', ignore_errors=True) + shutil.rmtree(path + "/__pycache__", ignore_errors=True) - folders = glob.glob(path + '/*/') + folders = glob.glob(path + "/*/") for folder in folders: - shutil.rmtree(folder + '/__pycache__', ignore_errors=True) + shutil.rmtree(folder + "/__pycache__", ignore_errors=True) clean_folder(folder) - cython_exts = glob.glob(folder + '/*.cpp') - cython_exts.extend(glob.glob(folder + '/*.cpython*')) + cython_exts = glob.glob(folder + "/*.cpp") + cython_exts.extend(glob.glob(folder + "/*.cpython*")) for file in cython_exts: os.remove(file) diff --git a/python/pylibraft/versioneer.py b/python/pylibraft/versioneer.py index b8c4bc423b..3842748f87 100644 --- a/python/pylibraft/versioneer.py +++ b/python/pylibraft/versioneer.py @@ -181,7 +181,7 @@ `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI distributions (and upload multiple independently-installable tarballs). * Source trees whose main purpose is to contain a C library, but which also - provide bindings to Python (and perhaps other langauges) in subdirectories. 
+ provide bindings to Python (and perhaps other languages) in subdirectories. Versioneer will look for `.git` in parent directories, and most operations should get the right version string. However `pip` and `setuptools` have bugs @@ -278,10 +278,12 @@ """ from __future__ import print_function + try: import configparser except ImportError: import ConfigParser as configparser + import errno import json import os @@ -309,11 +311,13 @@ def get_root(): setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - err = ("Versioneer was unable to run the project root directory. " - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND').") + err = ( + "Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND')." 
+ ) raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools @@ -326,8 +330,10 @@ def get_root(): me_dir = os.path.normcase(os.path.splitext(me)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) if me_dir != vsr_dir: - print("Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(me), versioneer_py)) + print( + "Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py) + ) except NameError: pass return root @@ -349,6 +355,7 @@ def get(parser, name): if parser.has_option("versioneer", name): return parser.get("versioneer", name) return None + cfg = VersioneerConfig() cfg.VCS = VCS cfg.style = get(parser, "style") or "" @@ -373,17 +380,20 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command( + commands, args, cwd=None, verbose=False, hide_stderr=False, env=None +): """Call the given command(s).""" assert isinstance(commands, list) p = None @@ -391,10 +401,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -419,7 +432,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, return stdout, p.returncode 
-LONG_VERSION_PY['git'] = ''' +LONG_VERSION_PY[ + "git" +] = ''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -994,7 +1009,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -1003,7 +1018,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) + tags = set([r for r in refs if re.search(r"\d", r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -1011,19 +1026,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } @register_vcs_handler("git", "pieces_from_vcs") @@ -1038,8 +1060,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + out, rc = run_command( + GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True + ) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -1047,10 +1070,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -1073,17 +1105,18 @@ def git_pieces_from_vcs(tag_prefix, 
root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? - pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = ( + "unable to parse git-describe output: '%s'" % describe_out + ) return pieces # tag @@ -1092,10 +1125,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -1106,13 +1141,15 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out, rc = run_command( + GITS, ["rev-list", "HEAD", "--count"], cwd=root + ) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], - cwd=root)[0].strip() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces @@ -1168,16 
+1205,22 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -1206,11 +1249,17 @@ def versions_from_file(filename): contents = f.read() except EnvironmentError: raise NotThisMethod("unable to read _version.py") - mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, + re.M | re.S, + ) if not mo: - mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, + re.M | re.S, + ) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) @@ -1219,8 +1268,9 @@ def versions_from_file(filename): def write_to_version_file(filename, versions): """Write the given version number to the given _version.py file.""" os.unlink(filename) - contents = json.dumps(versions, sort_keys=True, - indent=1, separators=(",", ": ")) + contents = json.dumps( + versions, sort_keys=True, indent=1, separators=(",", ": ") + ) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) @@ -1252,8 +1302,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = 
"0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -1367,11 +1416,13 @@ def render_git_describe_long(pieces): def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } if not style or style == "default": style = "pep440" # the default @@ -1391,9 +1442,13 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } class VersioneerBadRootError(Exception): @@ -1416,8 +1471,9 @@ def get_versions(verbose=False): handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or cfg.verbose - assert cfg.versionfile_source is not None, \ - "please set versioneer.versionfile_source" + assert ( + cfg.versionfile_source is not None + ), "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) @@ -1471,9 +1527,13 @@ def get_versions(verbose=False): if verbose: print("unable to compute version") - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, "error": "unable to compute version", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": 
None, + } def get_version(): @@ -1522,6 +1582,7 @@ def run(self): print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version # we override "build_py" in both distutils and setuptools @@ -1554,14 +1615,17 @@ def run(self): # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_build) + target_versionfile = os.path.join( + self.build_lib, cfg.versionfile_build + ) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py if "cx_Freeze" in sys.modules: # cx_freeze enabled? from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. # setup(console=[{ @@ -1582,17 +1646,21 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["build_exe"] = cmd_build_exe del cmds["build_py"] - if 'py2exe' in sys.modules: # py2exe enabled? + if "py2exe" in sys.modules: # py2exe enabled? 
try: from py2exe.distutils_buildexe import py2exe as _py2exe # py3 except ImportError: @@ -1611,13 +1679,17 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["py2exe"] = cmd_py2exe # we override different "sdist" commands for both environments @@ -1644,8 +1716,10 @@ def make_release_tree(self, base_dir, files): # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, - self._versioneer_generated_versions) + write_to_version_file( + target_versionfile, self._versioneer_generated_versions + ) + cmds["sdist"] = cmd_sdist return cmds @@ -1700,11 +1774,15 @@ def do_setup(): root = get_root() try: cfg = get_config_from_root(root) - except (EnvironmentError, configparser.NoSectionError, - configparser.NoOptionError) as e: + except ( + EnvironmentError, + configparser.NoSectionError, + configparser.NoOptionError, + ) as e: if isinstance(e, (EnvironmentError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", - file=sys.stderr) + print( + "Adding sample versioneer config to setup.cfg", file=sys.stderr + ) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) @@ -1713,15 +1791,18 @@ def do_setup(): print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - 
"PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), - "__init__.py") + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") if os.path.exists(ipy): try: with open(ipy, "r") as f: @@ -1763,8 +1844,10 @@ def do_setup(): else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: - print(" appending versionfile_source ('%s') to MANIFEST.in" % - cfg.versionfile_source) + print( + " appending versionfile_source ('%s') to MANIFEST.in" + % cfg.versionfile_source + ) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt index c6351ca622..fc93a2ddc2 100644 --- a/python/raft-dask/CMakeLists.txt +++ b/python/raft-dask/CMakeLists.txt @@ -25,39 +25,50 @@ project( # language to be enabled here. The test project that is built in scikit-build to verify # various linking options for the python library is hardcoded to build with C, so until # that is fixed we need to keep C. - C - CXX) + C CXX +) option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files" - OFF) + OFF +) + +option(RAFT_BUILD_WHEELS "Whether this build is generating a Python wheel." OFF) # If the user requested it we attempt to find RAFT. 
if(FIND_RAFT_CPP) - find_package(raft ${raft_dask_version} REQUIRED) + find_package(raft ${raft_dask_version} REQUIRED COMPONENTS distributed) else() set(raft_FOUND OFF) endif() if(NOT raft_FOUND) - # TODO: This will not be necessary once we upgrade to CMake 3.22, which will - # pull in the required languages for the C++ project even if this project - # does not require those languges. + # TODO: This will not be necessary once we upgrade to CMake 3.22, which will pull in the required + # languages for the C++ project even if this project does not require those languages. include(rapids-cuda) rapids_cuda_init_architectures(raft-dask) enable_language(CUDA) # Since raft-dask only enables CUDA optionally we need to manually include the file that # rapids_cuda_init_architectures relies on `project` including. - include("${CMAKE_PROJECT_raft_dask_INCLUDE}") + include("${CMAKE_PROJECT_raft-dask_INCLUDE}") + find_package(ucx REQUIRED) - # raft-dask doesn't actually use raft libraries, it just needs the headers, so - # we can turn off all library compilation and we don't need to install - # anything here. + # raft-dask doesn't actually use raft libraries, it just needs the headers, so we can turn off all + # library compilation and we don't need to install anything here. 
set(BUILD_TESTS OFF) set(BUILD_BENCH OFF) set(RAFT_COMPILE_LIBRARIES OFF) set(RAFT_COMPILE_DIST_LIBRARY OFF) set(RAFT_COMPILE_NN_LIBRARY OFF) - add_subdirectory(../../cpp raft-cpp) + + set(_exclude_from_all "") + if(RAFT_BUILD_WHEELS) + # Statically link dependencies if building wheels + set(CUDA_STATIC_RUNTIME ON) + # Don't install the raft C++ targets into wheels + set(_exclude_from_all EXCLUDE_FROM_ALL) + endif() + + add_subdirectory(../../cpp raft-cpp ${_exclude_from_all}) endif() include(rapids-cython) diff --git a/python/raft-dask/LICENSE b/python/raft-dask/LICENSE new file mode 120000 index 0000000000..30cff7403d --- /dev/null +++ b/python/raft-dask/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/raft-dask/cmake/thirdparty/get_nccl.cmake b/python/raft-dask/cmake/thirdparty/get_nccl.cmake index c2cc17b399..bb4b0e4dae 100644 --- a/python/raft-dask/cmake/thirdparty/get_nccl.cmake +++ b/python/raft-dask/cmake/thirdparty/get_nccl.cmake @@ -1,39 +1,34 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2021-2022, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= include(rapids-find) function(find_and_configure_nccl) - if(TARGET NCCL::NCCL) - return() - endif() + if(TARGET NCCL::NCCL) + return() + endif() - rapids_find_generate_module(NCCL - HEADER_NAMES nccl.h - LIBRARY_NAMES nccl - ) + rapids_find_generate_module( + NCCL + HEADER_NAMES nccl.h + LIBRARY_NAMES nccl + ) - # Currently NCCL has no CMake build-system so we require - # it built and installed on the machine already - rapids_find_package(NCCL REQUIRED) + # Currently NCCL has no CMake build-system so we require it built and installed on the machine + # already + rapids_find_package(NCCL REQUIRED) endfunction() find_and_configure_nccl() - - - - diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index dcdc3bbf71..0261c0b09d 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -19,7 +19,6 @@ requires = [ "setuptools", "cython>=0.29,<0.30", "scikit-build>=0.13.1", - "cmake>=3.23.1", + "cmake>=3.23.1,!=3.25.0", "ninja", - "pylibraft" ] diff --git a/python/raft-dask/pytest.ini b/python/raft-dask/pytest.ini index e48e31a00a..8904172272 100644 --- a/python/raft-dask/pytest.ini +++ b/python/raft-dask/pytest.ini @@ -5,4 +5,4 @@ markers = stress: marks stress tests mg: marks a test as multi-GPU memleak: marks a test as a memory leak test - + nccl: marks a test as using NCCL diff --git a/python/raft-dask/raft_dask/_version.py b/python/raft-dask/raft_dask/_version.py index 454b0fe7aa..4eb6056f17 100644 --- a/python/raft-dask/raft_dask/_version.py 
+++ b/python/raft-dask/raft_dask/_version.py @@ -70,7 +70,7 @@ def decorate(f): def run_command( - commands, args, cwd=None, verbose=False, hide_stderr=False, env=None + commands, args, cwd=None, verbose=False, hide_stderr=False, env=None ): """Call the given command(s).""" assert isinstance(commands, list) @@ -85,7 +85,7 @@ def run_command( env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), - ) + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -123,7 +123,7 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return { - "version": dirname[len(parentdir_prefix):], + "version": dirname[len(parentdir_prefix) :], "full-revisionid": None, "dirty": False, "error": None, @@ -193,7 +193,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -210,7 +210,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) return { @@ -264,7 +264,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): "--long", "--match", "%s*" % tag_prefix, - ], + ], cwd=root, ) # --long was added in git-1.5.5 @@ -299,7 +299,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if not mo: # unparseable. Maybe git-describe is misbehaving? 
pieces["error"] = ( - "unable to parse git-describe output: '%s'" % describe_out + "unable to parse git-describe output: '%s'" % describe_out ) return pieces @@ -314,7 +314,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): tag_prefix, ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) diff --git a/python/raft-dask/raft_dask/common/CMakeLists.txt b/python/raft-dask/raft_dask/common/CMakeLists.txt index 0e1da015fc..9827869b98 100644 --- a/python/raft-dask/raft_dask/common/CMakeLists.txt +++ b/python/raft-dask/raft_dask/common/CMakeLists.txt @@ -13,11 +13,10 @@ # ============================================================================= include(${raft-dask-python_SOURCE_DIR}/cmake/thirdparty/get_nccl.cmake) -find_package(ucx REQUIRED) set(cython_sources comms_utils.pyx nccl.pyx) -set(linked_libraries raft::raft NCCL::NCCL ucx::ucp) +set(linked_libraries raft::raft raft::distributed NCCL::NCCL) rapids_cython_create_modules( - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" - CXX) + SOURCE_FILES "${cython_sources}" ASSOCIATED_TARGETS raft LINKED_LIBRARIES "${linked_libraries}" + CXX +) diff --git a/python/raft-dask/raft_dask/common/__init__.py b/python/raft-dask/raft_dask/common/__init__.py index 8c25cdde90..c8ce695def 100644 --- a/python/raft-dask/raft_dask/common/__init__.py +++ b/python/raft-dask/raft_dask/common/__init__.py @@ -13,22 +13,21 @@ # limitations under the License. 
# -from .comms import Comms -from .comms import local_handle - -from .comms_utils import inject_comms_on_handle -from .comms_utils import inject_comms_on_handle_coll_only -from .comms_utils import perform_test_comms_allreduce -from .comms_utils import perform_test_comms_send_recv -from .comms_utils import perform_test_comms_device_send_or_recv -from .comms_utils import perform_test_comms_device_sendrecv -from .comms_utils import perform_test_comms_device_multicast_sendrecv -from .comms_utils import perform_test_comms_allgather -from .comms_utils import perform_test_comms_gather -from .comms_utils import perform_test_comms_gatherv -from .comms_utils import perform_test_comms_bcast -from .comms_utils import perform_test_comms_reduce -from .comms_utils import perform_test_comms_reducescatter -from .comms_utils import perform_test_comm_split - +from .comms import Comms, local_handle +from .comms_utils import ( + inject_comms_on_handle, + inject_comms_on_handle_coll_only, + perform_test_comm_split, + perform_test_comms_allgather, + perform_test_comms_allreduce, + perform_test_comms_bcast, + perform_test_comms_device_multicast_sendrecv, + perform_test_comms_device_send_or_recv, + perform_test_comms_device_sendrecv, + perform_test_comms_gather, + perform_test_comms_gatherv, + perform_test_comms_reduce, + perform_test_comms_reducescatter, + perform_test_comms_send_recv, +) from .ucx import UCX diff --git a/python/raft-dask/raft_dask/common/comms.py b/python/raft-dask/raft_dask/common/comms.py index d7260ece9b..0a7fb02da7 100644 --- a/python/raft-dask/raft_dask/common/comms.py +++ b/python/raft-dask/raft_dask/common/comms.py @@ -13,29 +13,28 @@ # limitations under the License. 
# -from .nccl import nccl -from .ucx import UCX - -from .comms_utils import inject_comms_on_handle -from .comms_utils import inject_comms_on_handle_coll_only - -from .utils import parse_host_port -from pylibraft.common.handle import Handle - -from dask.distributed import get_worker, default_client - -import warnings - import logging import time import uuid +import warnings from collections import OrderedDict +from pylibraft.common.handle import Handle + +from dask.distributed import default_client, get_worker + +from .comms_utils import ( + inject_comms_on_handle, + inject_comms_on_handle_coll_only, +) +from .nccl import nccl +from .ucx import UCX +from .utils import parse_host_port + logger = logging.getLogger(__name__) class Comms: - """ Initializes and manages underlying NCCL and UCX comms handles across the workers of a Dask cluster. It is expected that `init()` will be @@ -45,7 +44,7 @@ class Comms: Examples -------- - .. code-block:: python + .. code-block:: python # The following code block assumes we have wrapped a C++ # function in a Python function called `run_algorithm`, @@ -175,7 +174,6 @@ def init(self, workers=None): Parameters ---------- - workers : Sequence Unique collection of workers for initializing comms. 
""" @@ -256,7 +254,6 @@ def local_handle(sessionId): Returns ------- - handle : raft.Handle or None """ state = get_raft_comm_state(sessionId, get_worker()) @@ -277,7 +274,6 @@ def get_raft_comm_state(sessionId, state_object=None): Returns ------- - session state : str session state associated with sessionId """ diff --git a/python/raft-dask/raft_dask/common/comms_utils.pyx b/python/raft-dask/raft_dask/common/comms_utils.pyx index 2014284af1..7db04ef455 100644 --- a/python/raft-dask/raft_dask/common/comms_utils.pyx +++ b/python/raft-dask/raft_dask/common/comms_utils.pyx @@ -17,14 +17,12 @@ # cython: embedsignature = True # cython: language_level = 3 -from libc.stdlib cimport malloc, free -from cython.operator cimport dereference as deref - from cpython.long cimport PyLong_AsVoidPtr - +from cython.operator cimport dereference as deref +from libc.stdint cimport uintptr_t +from libc.stdlib cimport free, malloc from libcpp cimport bool -from libc.stdint cimport uintptr_t cdef extern from "nccl.h": diff --git a/python/raft-dask/raft_dask/common/nccl.pyx b/python/raft-dask/raft_dask/common/nccl.pyx index fd113e2222..a4d59610d3 100644 --- a/python/raft-dask/raft_dask/common/nccl.pyx +++ b/python/raft-dask/raft_dask/common/nccl.pyx @@ -19,11 +19,11 @@ # cython: embedsignature = True # cython: language_level = 3 -from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref - +from libc.stdint cimport uintptr_t +from libc.stdlib cimport free, malloc from libcpp cimport bool -from libc.stdlib cimport malloc, free + cdef extern from "raft/comms/std_comms.hpp" namespace "raft::comms": void get_unique_id(char *uid, int size) except + diff --git a/python/raft-dask/raft_dask/common/utils.py b/python/raft-dask/raft_dask/common/utils.py index daf51530be..78a899aa50 100644 --- a/python/raft-dask/raft_dask/common/utils.py +++ b/python/raft-dask/raft_dask/common/utils.py @@ -32,8 +32,8 @@ def parse_host_port(address): ------- tuple with host and port info : 
tuple(host, port) """ - if '://' in address: - address = address.rsplit('://', 1)[1] - host, port = address.split(':') + if "://" in address: + address = address.rsplit("://", 1)[1] + host, port = address.split(":") port = int(port) return host, port diff --git a/python/raft-dask/raft_dask/include_test/CMakeLists.txt b/python/raft-dask/raft_dask/include_test/CMakeLists.txt index 5e417f42ee..e588ce1d1e 100644 --- a/python/raft-dask/raft_dask/include_test/CMakeLists.txt +++ b/python/raft-dask/raft_dask/include_test/CMakeLists.txt @@ -15,6 +15,6 @@ set(cython_sources raft_include_test.pyx) set(linked_libraries raft::raft) rapids_cython_create_modules( - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" - CXX) + SOURCE_FILES "${cython_sources}" ASSOCIATED_TARGETS raft LINKED_LIBRARIES "${linked_libraries}" + CXX +) diff --git a/python/raft-dask/raft_dask/include_test/raft_include_test.pyx b/python/raft-dask/raft_dask/include_test/raft_include_test.pyx index 7d860b4c35..9b936368d1 100644 --- a/python/raft-dask/raft_dask/include_test/raft_include_test.pyx +++ b/python/raft-dask/raft_dask/include_test/raft_include_test.pyx @@ -15,5 +15,5 @@ def raft_include_test(): - print("RAFT Setup succesfully") + print("RAFT Setup successfully") return True diff --git a/python/raft-dask/raft_dask/test/conftest.py b/python/raft-dask/raft_dask/test/conftest.py index f5cdc49700..39ee21cbaa 100644 --- a/python/raft-dask/raft_dask/test/conftest.py +++ b/python/raft-dask/raft_dask/test/conftest.py @@ -1,13 +1,12 @@ # Copyright (c) 2022, NVIDIA CORPORATION. 
+import os + import pytest from dask.distributed import Client +from dask_cuda import LocalCUDACluster, initialize -from dask_cuda import initialize -from dask_cuda import LocalCUDACluster - -import os os.environ["UCX_LOG_LEVEL"] = "error" @@ -25,14 +24,18 @@ def cluster(): @pytest.fixture(scope="session") def ucx_cluster(): - initialize.initialize(create_cuda_context=True, - enable_tcp_over_ucx=enable_tcp_over_ucx, - enable_nvlink=enable_nvlink, - enable_infiniband=enable_infiniband) - cluster = LocalCUDACluster(protocol="ucx", - enable_tcp_over_ucx=enable_tcp_over_ucx, - enable_nvlink=enable_nvlink, - enable_infiniband=enable_infiniband) + initialize.initialize( + create_cuda_context=True, + enable_tcp_over_ucx=enable_tcp_over_ucx, + enable_nvlink=enable_nvlink, + enable_infiniband=enable_infiniband, + ) + cluster = LocalCUDACluster( + protocol="ucx", + enable_tcp_over_ucx=enable_tcp_over_ucx, + enable_nvlink=enable_nvlink, + enable_infiniband=enable_infiniband, + ) yield cluster cluster.close() diff --git a/python/raft-dask/raft_dask/test/test_comms.py b/python/raft-dask/raft_dask/test/test_comms.py index 29b4d963d9..74ec446e94 100644 --- a/python/raft-dask/raft_dask/test/test_comms.py +++ b/python/raft-dask/raft_dask/test/test_comms.py @@ -13,28 +13,29 @@ # limitations under the License. 
# -import pytest - from collections import OrderedDict -from dask.distributed import Client -from dask.distributed import wait +import pytest + +from dask.distributed import Client, wait try: - from raft_dask import Comms - from raft_dask.common import local_handle - from raft_dask.common import perform_test_comms_send_recv - from raft_dask.common import perform_test_comms_device_send_or_recv - from raft_dask.common import perform_test_comms_device_sendrecv - from raft_dask.common import perform_test_comms_device_multicast_sendrecv - from raft_dask.common import perform_test_comms_allreduce - from raft_dask.common import perform_test_comms_bcast - from raft_dask.common import perform_test_comms_reduce - from raft_dask.common import perform_test_comms_allgather - from raft_dask.common import perform_test_comms_gather - from raft_dask.common import perform_test_comms_gatherv - from raft_dask.common import perform_test_comms_reducescatter - from raft_dask.common import perform_test_comm_split + from raft_dask.common import ( + Comms, + local_handle, + perform_test_comm_split, + perform_test_comms_allgather, + perform_test_comms_allreduce, + perform_test_comms_bcast, + perform_test_comms_device_multicast_sendrecv, + perform_test_comms_device_send_or_recv, + perform_test_comms_device_sendrecv, + perform_test_comms_gather, + perform_test_comms_gatherv, + perform_test_comms_reduce, + perform_test_comms_reducescatter, + perform_test_comms_send_recv, + ) pytestmark = pytest.mark.mg except ImportError: diff --git a/python/raft-dask/raft_dask/test/test_raft.py b/python/raft-dask/raft_dask/test/test_raft.py index e1370dccc9..e1e1358f58 100644 --- a/python/raft-dask/raft_dask/test/test_raft.py +++ b/python/raft-dask/raft_dask/test/test_raft.py @@ -13,9 +13,10 @@ # limitations under the License. 
# -import pytest import sys +import pytest + try: import raft_dask except ImportError: @@ -23,7 +24,7 @@ pytestmart = pytest.mark.skip pytestmark = pytest.mark.skipif( - 'raft_dask' not in sys.argv, reason="marker to allow integration of RAFT" + "raft_dask" not in sys.argv, reason="marker to allow integration of RAFT" ) diff --git a/python/raft-dask/setup.cfg b/python/raft-dask/setup.cfg index c749a5a541..b005a7ab8f 100644 --- a/python/raft-dask/setup.cfg +++ b/python/raft-dask/setup.cfg @@ -1,11 +1,5 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -[flake8] -exclude = __init__.py,versioneer.py -# See the docstring in versioneer.py for instructions. Note that you must -# re-run 'versioneer.py setup' after changing this section, and commit the -# resulting files. - [versioneer] VCS = git style = pep440 diff --git a/python/raft-dask/setup.py b/python/raft-dask/setup.py index 59bcc4a2dc..bef3f41b4b 100644 --- a/python/raft-dask/setup.py +++ b/python/raft-dask/setup.py @@ -14,37 +14,86 @@ # limitations under the License. # +import os + +import versioneer from setuptools import find_packages from skbuild import setup -import versioneer +cuda_suffix = os.getenv("RAPIDS_PY_WHEEL_CUDA_SUFFIX", default="") + +install_requires = [ + "numpy", + "numba>=0.49", + "joblib>=0.11", + "dask-cuda>=22.10", + "dask>=2022.9.1", + f"ucx-py{cuda_suffix}", + "distributed>=2022.9.1", + f"pylibraft{cuda_suffix}", +] + +extras_require = { + "test": [ + "pytest", + "dask[distributed,dataframe]", + ] +} + + +def exclude_libcxx_symlink(cmake_manifest): + return list( + filter( + lambda name: not ("include/rapids/libcxx/include" in name), + cmake_manifest, + ) + ) + + +# Make versioneer produce PyPI-compatible nightly versions for wheels. 
+if "RAPIDS_PY_WHEEL_VERSIONEER_OVERRIDE" in os.environ: + orig_get_versions = versioneer.get_versions + + version_override = os.environ["RAPIDS_PY_WHEEL_VERSIONEER_OVERRIDE"] + + def get_versions(): + data = orig_get_versions() + data["version"] = version_override + return data + + versioneer.get_versions = get_versions -setup(name='raft-dask', - description="Reusable Accelerated Functions & Tools Dask Infrastructure", - version=versioneer.get_version(), - classifiers=[ +setup( + name=f"raft-dask{cuda_suffix}", + description="Reusable Accelerated Functions & Tools Dask Infrastructure", + version=versioneer.get_version(), + classifiers=[ "Intended Audience :: Developers", "Programming Language :: Python", "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9" - ], - author="NVIDIA Corporation", - package_data={ - # Note: A dict comprehension with an explicit copy is necessary - # (rather than something simpler like a dict.fromkeys) because - # otherwise every package will refer to the same list and skbuild - # modifies it in place. - key: ["*.hpp", "*.pxd"] - for key in find_packages( - include=[ - "raft_dask.common", - "raft_dask.common.includes", - ] - ) - }, - packages=find_packages(include=['raft_dask', 'raft_dask.*']), - license="Apache", - cmdclass=versioneer.get_cmdclass(), - zip_safe=False - ) + "Programming Language :: Python :: 3.9", + ], + author="NVIDIA Corporation", + include_package_data=True, + package_data={ + # Note: A dict comprehension with an explicit copy is necessary + # (rather than something simpler like a dict.fromkeys) because + # otherwise every package will refer to the same list and skbuild + # modifies it in place. 
+ key: ["*.hpp", "*.pxd"] + for key in find_packages( + include=[ + "raft_dask.common", + "raft_dask.common.includes", + ] + ) + }, + install_requires=install_requires, + extras_require=extras_require, + cmake_process_manifest_hook=exclude_libcxx_symlink, + packages=find_packages(include=["raft_dask", "raft_dask.*"]), + license="Apache 2.0", + cmdclass=versioneer.get_cmdclass(), + zip_safe=False, +) diff --git a/python/raft-dask/setuputils.py b/python/raft-dask/setuputils.py index 8893e09fd3..9370d29876 100755 --- a/python/raft-dask/setuputils.py +++ b/python/raft-dask/setuputils.py @@ -51,15 +51,15 @@ def clean_folder(path): path : String Path to the folder to be cleaned. """ - shutil.rmtree(path + '/__pycache__', ignore_errors=True) + shutil.rmtree(path + "/__pycache__", ignore_errors=True) - folders = glob.glob(path + '/*/') + folders = glob.glob(path + "/*/") for folder in folders: - shutil.rmtree(folder + '/__pycache__', ignore_errors=True) + shutil.rmtree(folder + "/__pycache__", ignore_errors=True) clean_folder(folder) - cython_exts = glob.glob(folder + '/*.cpp') - cython_exts.extend(glob.glob(folder + '/*.cpython*')) + cython_exts = glob.glob(folder + "/*.cpp") + cython_exts.extend(glob.glob(folder + "/*.cpython*")) for file in cython_exts: os.remove(file) diff --git a/python/raft-dask/versioneer.py b/python/raft-dask/versioneer.py index b8c4bc423b..3842748f87 100644 --- a/python/raft-dask/versioneer.py +++ b/python/raft-dask/versioneer.py @@ -181,7 +181,7 @@ `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI distributions (and upload multiple independently-installable tarballs). * Source trees whose main purpose is to contain a C library, but which also - provide bindings to Python (and perhaps other langauges) in subdirectories. + provide bindings to Python (and perhaps other languages) in subdirectories. Versioneer will look for `.git` in parent directories, and most operations should get the right version string. 
However `pip` and `setuptools` have bugs @@ -278,10 +278,12 @@ """ from __future__ import print_function + try: import configparser except ImportError: import ConfigParser as configparser + import errno import json import os @@ -309,11 +311,13 @@ def get_root(): setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - err = ("Versioneer was unable to run the project root directory. " - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND').") + err = ( + "Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND')." 
+ ) raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools @@ -326,8 +330,10 @@ def get_root(): me_dir = os.path.normcase(os.path.splitext(me)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) if me_dir != vsr_dir: - print("Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(me), versioneer_py)) + print( + "Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py) + ) except NameError: pass return root @@ -349,6 +355,7 @@ def get(parser, name): if parser.has_option("versioneer", name): return parser.get("versioneer", name) return None + cfg = VersioneerConfig() cfg.VCS = VCS cfg.style = get(parser, "style") or "" @@ -373,17 +380,20 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command( + commands, args, cwd=None, verbose=False, hide_stderr=False, env=None +): """Call the given command(s).""" assert isinstance(commands, list) p = None @@ -391,10 +401,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -419,7 +432,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, return stdout, p.returncode 
-LONG_VERSION_PY['git'] = ''' +LONG_VERSION_PY[ + "git" +] = ''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -994,7 +1009,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -1003,7 +1018,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) + tags = set([r for r in refs if re.search(r"\d", r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -1011,19 +1026,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } @register_vcs_handler("git", "pieces_from_vcs") @@ -1038,8 +1060,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + out, rc = run_command( + GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True + ) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -1047,10 +1070,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -1073,17 +1105,18 @@ def git_pieces_from_vcs(tag_prefix, 
root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? - pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = ( + "unable to parse git-describe output: '%s'" % describe_out + ) return pieces # tag @@ -1092,10 +1125,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -1106,13 +1141,15 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out, rc = run_command( + GITS, ["rev-list", "HEAD", "--count"], cwd=root + ) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], - cwd=root)[0].strip() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces @@ -1168,16 
+1205,22 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -1206,11 +1249,17 @@ def versions_from_file(filename): contents = f.read() except EnvironmentError: raise NotThisMethod("unable to read _version.py") - mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, + re.M | re.S, + ) if not mo: - mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, + re.M | re.S, + ) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) @@ -1219,8 +1268,9 @@ def versions_from_file(filename): def write_to_version_file(filename, versions): """Write the given version number to the given _version.py file.""" os.unlink(filename) - contents = json.dumps(versions, sort_keys=True, - indent=1, separators=(",", ": ")) + contents = json.dumps( + versions, sort_keys=True, indent=1, separators=(",", ": ") + ) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) @@ -1252,8 +1302,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = 
"0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -1367,11 +1416,13 @@ def render_git_describe_long(pieces): def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } if not style or style == "default": style = "pep440" # the default @@ -1391,9 +1442,13 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } class VersioneerBadRootError(Exception): @@ -1416,8 +1471,9 @@ def get_versions(verbose=False): handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or cfg.verbose - assert cfg.versionfile_source is not None, \ - "please set versioneer.versionfile_source" + assert ( + cfg.versionfile_source is not None + ), "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) @@ -1471,9 +1527,13 @@ def get_versions(verbose=False): if verbose: print("unable to compute version") - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, "error": "unable to compute version", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": 
None, + } def get_version(): @@ -1522,6 +1582,7 @@ def run(self): print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version # we override "build_py" in both distutils and setuptools @@ -1554,14 +1615,17 @@ def run(self): # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_build) + target_versionfile = os.path.join( + self.build_lib, cfg.versionfile_build + ) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py if "cx_Freeze" in sys.modules: # cx_freeze enabled? from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. # setup(console=[{ @@ -1582,17 +1646,21 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["build_exe"] = cmd_build_exe del cmds["build_py"] - if 'py2exe' in sys.modules: # py2exe enabled? + if "py2exe" in sys.modules: # py2exe enabled? 
try: from py2exe.distutils_buildexe import py2exe as _py2exe # py3 except ImportError: @@ -1611,13 +1679,17 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["py2exe"] = cmd_py2exe # we override different "sdist" commands for both environments @@ -1644,8 +1716,10 @@ def make_release_tree(self, base_dir, files): # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, - self._versioneer_generated_versions) + write_to_version_file( + target_versionfile, self._versioneer_generated_versions + ) + cmds["sdist"] = cmd_sdist return cmds @@ -1700,11 +1774,15 @@ def do_setup(): root = get_root() try: cfg = get_config_from_root(root) - except (EnvironmentError, configparser.NoSectionError, - configparser.NoOptionError) as e: + except ( + EnvironmentError, + configparser.NoSectionError, + configparser.NoOptionError, + ) as e: if isinstance(e, (EnvironmentError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", - file=sys.stderr) + print( + "Adding sample versioneer config to setup.cfg", file=sys.stderr + ) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) @@ -1713,15 +1791,18 @@ def do_setup(): print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - 
"PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), - "__init__.py") + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") if os.path.exists(ipy): try: with open(ipy, "r") as f: @@ -1763,8 +1844,10 @@ def do_setup(): else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: - print(" appending versionfile_source ('%s') to MANIFEST.in" % - cfg.versionfile_source) + print( + " appending versionfile_source ('%s') to MANIFEST.in" + % cfg.versionfile_source + ) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000..fc3d7c0d41 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,56 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +[flake8] +filename = *.py, *.pyx, *.pxd, *.pxi +exclude = __init__.py, *.egg, build, docs, .git, versioneer.py +force-check = True +ignore = + # line break before binary operator + W503, + # whitespace before : + E203 +per-file-ignores = + # Rules ignored only in Cython: + # E211: whitespace before '(' (used in multi-line imports) + # E225: Missing whitespace around operators (breaks cython casting syntax like ) + # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) + # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) + # E275: Missing whitespace after keyword (Doesn't work with Cython except?) 
+ # E402: invalid syntax (works for Python, not Cython) + # E999: invalid syntax (works for Python, not Cython) + # W504: line break after binary operator (breaks lines that end with a pointer) + *.pyx: E211, E225, E226, E227, E275, E402, E999, W504 + *.pxd: E211, E225, E226, E227, E275, E402, E999, W504 + *.pxi: E211, E225, E226, E227, E275, E402, E999, W504 + +[pydocstyle] +# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather +# than include using match-dir. Note that as discussed in +# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle, +# unlike the match option above this match-dir will have no effect when +# pydocstyle is invoked from pre-commit. Therefore this exclusion list must +# also be maintained in the pre-commit config file. +match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks)).*$ +# Allow missing docstrings for docutils +ignore-decorators = .*(docutils|doc_apply|copy_docstring).* +select = + D201, D204, D206, D207, D208, D209, D210, D211, D214, D215, D300, D301, D302, D403, D405, D406, D407, D408, D409, D410, D411, D412, D414, D418 + # Would like to enable the following rules in the future: + # D200, D202, D205, D400 + +[mypy] +ignore_missing_imports = True +# If we don't specify this, then mypy will check excluded files if +# they are imported by a checked file. +follow_imports = skip +exclude=_version.py + +[codespell] +# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - +# this is only to allow you to run codespell interactively +skip = ./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,.*_skbuild +# ignore short words, and typename parameters like OffsetT +ignore-regex = \b(.{1,4}|[A-Z]\w*T)\b +ignore-words-list = inout,unparseable,numer +builtin = clear +quiet-level = 3