diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d22af4779e..3c8cc4912d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -52,6 +52,7 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} + skip_upload_pkgs: libraft-template docs-build: if: github.ref_type == 'branch' && github.event_name == 'push' needs: python-build diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index bf080d6ad2..cf8f8cd4b5 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -105,7 +105,7 @@ jobs: build_type: pull-request package-name: raft_dask # Always want to test against latest dask/distributed. - test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.04" - test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.04" + test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" + test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install 
git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test" test-smoketest: "python ./ci/wheel_smoke_test_raft_dask.py" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index f1207c3545..ebf596c958 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -51,6 +51,6 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} package-name: raft_dask - test-before-amd64: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.04" - test-before-arm64: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.04" + test-before-amd64: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" + test-before-arm64: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7606914589..d6e4ecb676 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -62,7 +62,7 @@ repos: entry: ./cpp/scripts/run-cmake-format.sh cmake-format language: python types: [cmake] - exclude: .*/thirdparty/.* + exclude: .*/thirdparty/.*|.*FindAVX.cmake.* # Note that pre-commit autoupdate does not update the versions # of dependencies, so we'll have to update this manually. 
additional_dependencies: @@ -101,7 +101,7 @@ repos: args: ["--toml", "pyproject.toml"] exclude: (?x)^(^CHANGELOG.md$) - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.4.0 + rev: v1.5.1 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/README.md b/README.md index a178d90008..b77e906262 100755 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ #
 RAFT: Reusable Accelerated Functions and Tools
+![Navigating the canyons of accelerated possibilities](img/raft.png) + ## Resources - [RAFT Reference Documentation](https://docs.rapids.ai/api/raft/stable/): API Documentation. @@ -32,12 +34,16 @@ While not exhaustive, the following general categories help summarize the accele | **Tools & Utilities** | common utilities for developing CUDA applications, multi-node multi-gpu infrastructure | -All of RAFT's C++ APIs can be accessed header-only and optional pre-compiled shared libraries can 1) speed up compile times and 2) enable the APIs to be used without CUDA-enabled compilers. +RAFT is a C++ header-only template library with an optional shared library that +1) can speed up compile times for common template types, and +2) provides host-accessible "runtime" APIs, which don't require a CUDA compiler to use -In addition to the C++ library, RAFT also provides 2 Python libraries: -- `pylibraft` - lightweight low-level Python wrappers around RAFT's host-accessible "runtime" APIs. +In addition being a C++ library, RAFT also provides 2 Python libraries: +- `pylibraft` - lightweight Python wrappers around RAFT's host-accessible "runtime" APIs. - `raft-dask` - multi-node multi-GPU communicator infrastructure for building distributed algorithms on the GPU with Dask. 
+![RAFT is a C++ header-only template library with optional shared library and lightweight Python wrappers](img/arch.png) + ## Getting started ### RAPIDS Memory Manager (RMM) @@ -78,9 +84,9 @@ raft::device_resources handle; int n_samples = 5000; int n_features = 50; -auto input = raft::make_device_matrix(handle, n_samples, n_features); -auto labels = raft::make_device_vector(handle, n_samples); -auto output = raft::make_device_matrix(handle, n_samples, n_samples); +auto input = raft::make_device_matrix(handle, n_samples, n_features); +auto labels = raft::make_device_vector(handle, n_samples); +auto output = raft::make_device_matrix(handle, n_samples, n_samples); raft::random::make_blobs(handle, input.view(), labels.view()); @@ -192,8 +198,7 @@ RAFT itself can be installed through conda, [CMake Package Manager (CPM)](https: The easiest way to install RAFT is through conda and several packages are provided. - `libraft-headers` RAFT headers -- `libraft-nn` (optional) contains shared libraries for the nearest neighbors primitives. -- `libraft-distance` (optional) contains shared libraries for distance primitives. +- `libraft` (optional) shared library of pre-compiled template specializations and runtime APIs. - `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives. - `raft-dask` (optional) enables deployment of multi-node multi-GPU algorithms that use RAFT `raft::comms` in Dask clusters. @@ -202,73 +207,35 @@ Use the following command to install all of the RAFT packages with conda (replac mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft ``` -You can also install the `libraft-*` conda packages individually using the `mamba` command above. +You can also install the conda packages individually using the `mamba` command above. -After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ cmake build to compile and/or link against needed dependencies in your raft target. 
`COMPONENTS` are optional and will depend on the packages installed. +After installing RAFT, `find_package(raft COMPONENTS compiled distributed)` can be used in your CUDA/C++ cmake build to compile and/or link against needed dependencies in your raft target. `COMPONENTS` are optional and will depend on the packages installed. ### Pip pylibraft and raft-dask both have experimental packages that can be [installed through pip](https://rapids.ai/pip.html#install): ```bash -pip install pylibraft-cu11 --extra-index-url=https://pypi.ngc.nvidia.com -pip install raft-dask-cu11 --extra-index-url=https://pypi.ngc.nvidia.com +pip install pylibraft-cu11 --extra-index-url=https://pypi.nvidia.com +pip install raft-dask-cu11 --extra-index-url=https://pypi.nvidia.com ``` ### CMake & CPM -RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library, which makes it simple to include in downstream cmake projects. RAPIDS CMake provides a convenience layer around CPM. - -After [installing](https://github.com/rapidsai/rapids-cmake#installation) rapids-cmake in your project, you can begin using RAFT by placing the code snippet below in a file named `get_raft.cmake` and including it in your cmake build with `include(get_raft.cmake)`. This will make available several targets to add to configure the link libraries for your artifacts. 
- -```cmake - -set(RAFT_VERSION "22.12") -set(RAFT_FORK "rapidsai") -set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") - -function(find_and_configure_raft) - set(oneValueArgs VERSION FORK PINNED_TAG COMPILE_LIBRARIES) - cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN} ) - - #----------------------------------------------------- - # Invoke CPM find_package() - #----------------------------------------------------- - - rapids_cpm_find(raft ${PKG_VERSION} - GLOBAL_TARGETS raft::raft - BUILD_EXPORT_SET projname-exports - INSTALL_EXPORT_SET projname-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/${PKG_FORK}/raft.git - GIT_TAG ${PKG_PINNED_TAG} - SOURCE_SUBDIR cpp - OPTIONS - "BUILD_TESTS OFF" - "BUILD_BENCH OFF" - "RAFT_COMPILE_LIBRARIES ${PKG_COMPILE_LIBRARIES}" - ) - -endfunction() - -# Change pinned tag here to test a commit in CI -# To use a different RAFT locally, set the CMake variable -# CPM_raft_SOURCE=/path/to/local/raft -find_and_configure_raft(VERSION ${RAFT_VERSION}.00 - FORK ${RAFT_FORK} - PINNED_TAG ${RAFT_PINNED_TAG} - COMPILE_LIBRARIES NO -) -``` +RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library, which makes it easy to include in downstream cmake projects. RAPIDS-CMake provides a convenience layer around CPM. Please refer to [these instructions](https://github.com/rapidsai/rapids-cmake#installation) to install and use rapids-cmake in your project. + +#### Example Template Project + +You can find an [example RAFT](cpp/template/README.md) project template in the `cpp/template` directory, which demonstrates how to build a new application with RAFT or incorporate RAFT into an existing cmake project. + +#### CMake Targets -Several CMake targets can be made available by adding components in the table below to the `RAFT_COMPONENTS` list above, separated by spaces. The `raft::raft` target will always be available. 
RAFT headers require, at a minimum, the CUDA toolkit libraries and RMM dependencies. +Additional CMake targets can be made available by adding components in the table below to the `RAFT_COMPONENTS` list above, separated by spaces. The `raft::raft` target will always be available. RAFT headers require, at a minimum, the CUDA toolkit libraries and RMM dependencies. -| Component | Target | Description | Base Dependencies | -| --- | --- | --- | --- | -| n/a | `raft::raft` | Full RAFT header library | CUDA toolkit library, RMM, Thrust (optional), NVTools (optional) | -| distance | `raft::distance` | Pre-compiled template specializations for raft::distance | raft::raft, cuCollections (optional) | -| nn | `raft::nn` | Pre-compiled template specializations for raft::neighbors | raft::raft, FAISS (optional) | -| distributed | `raft::distributed` | No specializations | raft::raft, UCX, NCCL | +| Component | Target | Description | Base Dependencies | +|-------------|---------------------|-----------------------------------------------------------|---------------------------------------| +| n/a | `raft::raft` | Full RAFT header library | CUDA toolkit, RMM, NVTX, CCCL, CUTLASS | +| compiled | `raft::compiled` | Pre-compiled template specializations and runtime library | raft::raft | +| distributed | `raft::distributed` | Dependencies for `raft::comms` APIs | raft::raft, UCX, NCCL | ### Source @@ -279,7 +246,7 @@ mamba env create --name raft_dev_env -f conda/environments/all_cuda-118_arch-x86 mamba activate raft_dev_env ``` ``` -./build.sh raft-dask pylibraft libraft tests bench --compile-libs +./build.sh raft-dask pylibraft libraft tests bench --compile-lib ``` The [build](docs/source/build.md) instructions contain more details on building RAFT from source and including it in downstream projects. 
You can also find a more comprehensive version of the above CPM code snippet the [Building RAFT C++ from source](docs/source/build.md#building-raft-c-from-source-in-cmake) section of the build instructions. @@ -316,6 +283,7 @@ The folder structure mirrors other RAPIDS repos, with the following folders: - `internal`: A private header-only component that hosts the code shared between benchmarks and tests. - `scripts`: Helpful scripts for development - `src`: Compiled APIs and template specializations for the shared libraries + - `template`: A skeleton template containing the bare-bones file structure and cmake configuration for writing applications with RAFT. - `test`: Googletests source code - `docs`: Source code and scripts for building library documentation (Uses breath, doxygen, & pydocs) - `python`: Source code for Python libraries. diff --git a/build.sh b/build.sh index 575f6bdaa1..7e1a3e7e36 100755 --- a/build.sh +++ b/build.sh @@ -2,7 +2,7 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -# raft build script +# raft build scripts # This script is used to build the component(s) in this repo from # source, and can be called with various options to customize the @@ -15,11 +15,11 @@ NUMARGS=$# ARGS=$* # NOTE: ensure all dir changes are relative to the location of this -# script, and that this script resides in the repo dir! +# scripts, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pylibraft raft-dask docs tests bench clean --uninstall -v -g -n --compile-libs --compile-nn --compile-dist --allgpuarch --no-nvtx --show_depr_warn -h --buildfaiss --minimal-deps" -HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=] [--limit-tests=] [--limit-bench=] +VALIDARGS="clean libraft pylibraft raft-dask docs tests template bench-prims bench-ann clean --uninstall -v -g -n --compile-lib --allgpuarch --no-nvtx --show_depr_warn --time -h" +HELP="$0 [ ...] [ ...] 
[--cmake-args=\"\"] [--cache-tool=] [--limit-tests=] [--limit-bench-prims=] [--limit-bench-ann=] where is: clean - remove all existing build artifacts and configuration (start over) libraft - build the raft C++ code only. Also builds the C-wrapper library @@ -28,29 +28,28 @@ HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool= is: -v - verbose build mode -g - build for debug -n - no install step --uninstall - uninstall files for specified targets which were built and installed prior - --compile-libs - compile shared libraries for all components - --compile-nn - compile shared library for nn component - --compile-dist - compile shared library for distance and current random components - (eventually, this will be renamed to something more generic and - the only option to be supported) - --minimal-deps - disables dependencies like thrust so they can be overridden. + --compile-lib - compile shared libraries for all components can be useful for a pure header-only install --limit-tests - semicolon-separated list of test executables to compile (e.g. NEIGHBORS_TEST;CLUSTER_TEST) - --limit-bench - semicolon-separated list of benchmark executables to compute (e.g. NEIGHBORS_BENCH;CLUSTER_BENCH) + --limit-bench-prims - semicolon-separated list of prims benchmark executables to compute (e.g. NEIGHBORS_PRIMS_BENCH;CLUSTER_PRIMS_BENCH) + --limit-bench-ann - semicolon-separated list of ann benchmark executables to compute (e.g. HNSWLIB_ANN_BENCH;RAFT_IVF_PQ_ANN_BENCH) --allgpuarch - build for all supported GPU architectures - --buildfaiss - build faiss statically into raft --no-nvtx - disable nvtx (profiling markers), but allow enabling it in downstream projects --show_depr_warn - show cmake deprecation warnings --cmake-args=\\\"\\\" - pass arbitrary list of CMake configuration options (escape all quotes in argument) --cache-tool= - pass the build cache tool (eg: ccache, sccache, distcc) that will be used to speedup the build process. 
+ --time - Enable nvcc compilation time logging into cpp/build/nvcc_compile_log.csv. + Results can be interpreted with cpp/scripts/analyze_nvcc_log.py -h - print this text default action (no args) is to build libraft, tests, pylibraft and raft-dask targets @@ -68,20 +67,17 @@ VERBOSE_FLAG="" BUILD_ALL_GPU_ARCH=0 BUILD_TESTS=OFF BUILD_TYPE=Release -BUILD_BENCH=OFF -BUILD_STATIC_FAISS=OFF -COMPILE_LIBRARIES=OFF -COMPILE_NN_LIBRARY=OFF -COMPILE_DIST_LIBRARY=OFF -ENABLE_NN_DEPENDENCIES=OFF +BUILD_PRIMS_BENCH=OFF +BUILD_ANN_BENCH=OFF +COMPILE_LIBRARY=OFF INSTALL_TARGET=install TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NEIGHBORS_TEST;NEIGHBORS_TEST;STATS_TEST;UTILS_TEST" BENCH_TARGETS="CLUSTER_BENCH;NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH;MATRIX_BENCH;SPARSE_BENCH;RANDOM_BENCH" -ENABLE_thrust_DEPENDENCY=ON CACHE_ARGS="" NVTX=ON +LOG_COMPILE_TIME=OFF CLEAN=0 UNINSTALL=0 DISABLE_DEPRECATION_WARNINGS=ON @@ -161,15 +157,30 @@ function limitTests { function limitBench { # Check for option to limit the set of test binaries to build - if [[ -n $(echo $ARGS | { grep -E "\-\-limit\-bench" || true; } ) ]]; then + if [[ -n $(echo $ARGS | { grep -E "\-\-limit\-bench-prims" || true; } ) ]]; then # There are possible weird edge cases that may cause this regex filter to output nothing and fail silently # the true pipe will catch any weird edge cases that may happen and will cause the program to fall back # on the invalid option error - LIMIT_BENCH_TARGETS=$(echo $ARGS | sed -e 's/.*--limit-bench=//' -e 's/ .*//') - if [[ -n ${LIMIT_BENCH_TARGETS} ]]; then + LIMIT_PRIMS_BENCH_TARGETS=$(echo $ARGS | sed -e 's/.*--limit-bench-prims=//' -e 's/ .*//') + if [[ -n ${LIMIT_PRIMS_BENCH_TARGETS} ]]; then + # Remove the full LIMIT_PRIMS_BENCH_TARGETS argument from list of args so that it passes validArgs function + ARGS=${ARGS//--limit-bench-prims=$LIMIT_PRIMS_BENCH_TARGETS/} + 
PRIMS_BENCH_TARGETS=${LIMIT_PRIMS_BENCH_TARGETS} + fi + fi +} + +function limitAnnBench { + # Check for option to limit the set of test binaries to build + if [[ -n $(echo $ARGS | { grep -E "\-\-limit\-bench-ann" || true; } ) ]]; then + # There are possible weird edge cases that may cause this regex filter to output nothing and fail silently + # the true pipe will catch any weird edge cases that may happen and will cause the program to fall back + # on the invalid option error + LIMIT_ANN_BENCH_TARGETS=$(echo $ARGS | sed -e 's/.*--limit-bench-ann=//' -e 's/ .*//') + if [[ -n ${LIMIT_ANN_BENCH_TARGETS} ]]; then # Remove the full LIMIT_TEST_TARGETS argument from list of args so that it passes validArgs function - ARGS=${ARGS//--limit-bench=$LIMIT_BENCH_TARGETS/} - BENCH_TARGETS=${LIMIT_BENCH_TARGETS} + ARGS=${ARGS//--limit-bench-ann=$LIMIT_ANN_BENCH_TARGETS/} + ANN_BENCH_TARGETS=${LIMIT_ANN_BENCH_TARGETS} fi fi } @@ -185,6 +196,7 @@ if (( ${NUMARGS} != 0 )); then cacheTool limitTests limitBench + limitAnnBench for a in ${ARGS}; do if ! 
(echo " ${VALIDARGS} " | grep -q " ${a} "); then echo "Invalid option: ${a}" @@ -257,10 +269,6 @@ if hasArg -n; then INSTALL_TARGET="" fi -if hasArg --minimal-deps; then - ENABLE_thrust_DEPENDENCY=OFF -fi - if hasArg -v; then VERBOSE_FLAG="-v" CMAKE_LOG_LEVEL="VERBOSE" @@ -273,37 +281,16 @@ if hasArg --allgpuarch; then BUILD_ALL_GPU_ARCH=1 fi -if hasArg --compile-libs || (( ${NUMARGS} == 0 )); then - COMPILE_LIBRARIES=ON -fi - -if hasArg --compile-nn || hasArg --compile-libs || (( ${NUMARGS} == 0 )); then - ENABLE_NN_DEPENDENCIES=ON - COMPILE_NN_LIBRARY=ON - CMAKE_TARGET="${CMAKE_TARGET};raft_nn_lib" -fi - -if hasArg --compile-dist || hasArg --compile-libs || (( ${NUMARGS} == 0 )); then - COMPILE_DIST_LIBRARY=ON - CMAKE_TARGET="${CMAKE_TARGET};raft_distance_lib" +if hasArg --compile-lib || (( ${NUMARGS} == 0 )); then + COMPILE_LIBRARY=ON + CMAKE_TARGET="${CMAKE_TARGET};raft_lib" fi if hasArg tests || (( ${NUMARGS} == 0 )); then BUILD_TESTS=ON CMAKE_TARGET="${CMAKE_TARGET};${TEST_TARGETS}" - # Force compile nn library when needed test targets are specified - if [[ $CMAKE_TARGET == *"CLUSTER_TEST"* || \ - $CMAKE_TARGET == *"SPARSE_DIST_TEST"* || \ - $CMAKE_TARGET == *"SPARSE_NEIGHBORS_TEST"* || \ - $CMAKE_TARGET == *"NEIGHBORS_TEST"* || \ - $CMAKE_TARGET == *"STATS_TEST"* ]]; then - echo "-- Enabling nearest neighbors lib for gtests" - ENABLE_NN_DEPENDENCIES=ON - COMPILE_NN_LIBRARY=ON - fi - - # Force compile distance library when needed test targets are specified + # Force compile library when needed test targets are specified if [[ $CMAKE_TARGET == *"CLUSTER_TEST"* || \ $CMAKE_TARGET == *"DISTANCE_TEST"* || \ $CMAKE_TARGET == *"SPARSE_DIST_TEST" || \ @@ -311,39 +298,37 @@ if hasArg tests || (( ${NUMARGS} == 0 )); then $CMAKE_TARGET == *"MATRIX_TEST"* || \ $CMAKE_TARGET == *"NEIGHBORS_TEST" || \ $CMAKE_TARGET == *"STATS_TEST"* ]]; then - echo "-- Enabling distance lib for gtests" - COMPILE_DIST_LIBRARY=ON + echo "-- Enabling compiled lib for gtests" + 
COMPILE_LIBRARY=ON fi fi -if hasArg bench || (( ${NUMARGS} == 0 )); then - BUILD_BENCH=ON - CMAKE_TARGET="${CMAKE_TARGET};${BENCH_TARGETS}" +if hasArg bench-prims || (( ${NUMARGS} == 0 )); then + BUILD_PRIMS_BENCH=ON + CMAKE_TARGET="${CMAKE_TARGET};${PRIMS_BENCH_TARGETS}" - # Force compile nn library when needed benchmark targets are specified - if [[ $CMAKE_TARGET == *"CLUSTER_BENCH"* || \ - $CMAKE_TARGET == *"NEIGHBORS_BENCH"* ]]; then - echo "-- Enabling nearest neighbors lib for benchmarks" - ENABLE_NN_DEPENDENCIES=ON - COMPILE_NN_LIBRARY=ON + # Force compile library when needed benchmark targets are specified + if [[ $CMAKE_TARGET == *"CLUSTER_PRIMS_BENCH"* || \ + $CMAKE_TARGET == *"MATRIX_PRIMS_BENCH"* || \ + $CMAKE_TARGET == *"NEIGHBORS_PRIMS_BENCH"* ]]; then + echo "-- Enabling compiled lib for benchmarks" + COMPILE_LIBRARY=ON fi - - # Force compile distance library when needed benchmark targets are specified - if [[ $CMAKE_TARGET == *"CLUSTER_BENCH"* || \ - $CMAKE_TARGET == *"MATRIX_BENCH"* || \ - $CMAKE_TARGET == *"NEIGHBORS_BENCH"* ]]; then - echo "-- Enabling distance lib for benchmarks" - COMPILE_DIST_LIBRARY=ON - fi - fi -if hasArg --buildfaiss; then - BUILD_STATIC_FAISS=ON +if hasArg bench-ann || (( ${NUMARGS} == 0 )); then + BUILD_ANN_BENCH=ON + CMAKE_TARGET="${CMAKE_TARGET};${ANN_BENCH_TARGETS}" + COMPILE_LIBRARY=ON fi + if hasArg --no-nvtx; then NVTX=OFF fi +if hasArg --time; then + echo "-- Logging compile times to cpp/build/nvcc_compile_log.csv" + LOG_COMPILE_TIME=ON +fi if hasArg --show_depr_warn; then DISABLE_DEPRECATION_WARNINGS=OFF fi @@ -351,8 +336,6 @@ if hasArg clean; then CLEAN=1 fi - - if [[ ${CMAKE_TARGET} == "" ]]; then CMAKE_TARGET="all" fi @@ -386,7 +369,7 @@ fi ################################################################################ # Configure for building all C++ targets -if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench; then +if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg 
docs || hasArg tests || hasArg bench-prims || hasArg bench-ann; then if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then RAFT_CMAKE_CUDA_ARCHITECTURES="NATIVE" echo "Building for the architecture of the GPU in the system..." @@ -401,17 +384,14 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -DCMAKE_CUDA_ARCHITECTURES=${RAFT_CMAKE_CUDA_ARCHITECTURES} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DRAFT_COMPILE_LIBRARIES=${COMPILE_LIBRARIES} \ - -DRAFT_ENABLE_NN_DEPENDENCIES=${ENABLE_NN_DEPENDENCIES} \ + -DRAFT_COMPILE_LIBRARY=${COMPILE_LIBRARY} \ -DRAFT_NVTX=${NVTX} \ + -DCUDA_LOG_COMPILE_TIME=${LOG_COMPILE_TIME} \ -DDISABLE_DEPRECATION_WARNINGS=${DISABLE_DEPRECATION_WARNINGS} \ -DBUILD_TESTS=${BUILD_TESTS} \ - -DBUILD_BENCH=${BUILD_BENCH} \ + -DBUILD_PRIMS_BENCH=${BUILD_PRIMS_BENCH} \ + -DBUILD_ANN_BENCH=${BUILD_ANN_BENCH} \ -DCMAKE_MESSAGE_LOG_LEVEL=${CMAKE_LOG_LEVEL} \ - -DRAFT_COMPILE_NN_LIBRARY=${COMPILE_NN_LIBRARY} \ - -DRAFT_COMPILE_DIST_LIBRARY=${COMPILE_DIST_LIBRARY} \ - -DRAFT_USE_FAISS_STATIC=${BUILD_STATIC_FAISS} \ - -DRAFT_ENABLE_thrust_DEPENDENCY=${ENABLE_thrust_DEPENDENCY} \ ${CACHE_ARGS} \ ${EXTRA_CMAKE_ARGS} @@ -425,34 +405,34 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has fi fi -# Build and (optionally) install the raft-dask Python package -if (( ${NUMARGS} == 0 )) || hasArg raft-dask; then +# Build and (optionally) install the pylibraft Python package +if (( ${NUMARGS} == 0 )) || hasArg pylibraft; then # Append `-DFIND_RAFT_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option. 
if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RAFT_CPP"* ]]; then EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_RAFT_CPP=ON" fi - - cd ${REPODIR}/python/raft-dask + cd ${REPODIR}/python/pylibraft python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH="${RAFT_DASK_BUILD_DIR};${INSTALL_PREFIX}" -DCMAKE_LIBRARY_PATH=${LIBRAFT_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} if [[ ${INSTALL_TARGET} != "" ]]; then python setup.py install --single-version-externally-managed --record=record.txt -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} ${EXTRA_CMAKE_ARGS} fi fi -# Build and (optionally) install the pylibraft Python package -if (( ${NUMARGS} == 0 )) || hasArg pylibraft; then +# Build and (optionally) install the raft-dask Python package +if (( ${NUMARGS} == 0 )) || hasArg raft-dask; then # Append `-DFIND_RAFT_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option. if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RAFT_CPP"* ]]; then EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_RAFT_CPP=ON" fi - cd ${REPODIR}/python/pylibraft + cd ${REPODIR}/python/raft-dask python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH="${RAFT_DASK_BUILD_DIR};${INSTALL_PREFIX}" -DCMAKE_LIBRARY_PATH=${LIBRAFT_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} if [[ ${INSTALL_TARGET} != "" ]]; then python setup.py install --single-version-externally-managed --record=record.txt -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} ${EXTRA_CMAKE_ARGS} fi fi + if hasArg docs; then set -x cd ${DOXYGEN_BUILD_DIR} @@ -460,3 +440,12 @@ if hasArg docs; then cd ${SPHINX_BUILD_DIR} sphinx-build -b html source _html fi + +################################################################################ +# Initiate build for example RAFT application template (if needed) + +if hasArg template; then + pushd cpp/template + ./build.sh + popd +fi diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 78c4399d28..5db6fa11be 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -24,9 +24,8 @@ 
VERSION_NUMBER=$(rapids-get-rapids-version-from-git) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - libraft-distance \ + libraft \ libraft-headers \ - libraft-nn \ pylibraft \ raft-dask diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py index a44314a6ce..123aeba87b 100644 --- a/ci/checks/copyright.py +++ b/ci/checks/copyright.py @@ -192,7 +192,8 @@ def checkCopyright_main(): action="append", required=False, default=["python/cuml/_thirdparty/", - "cpp/include/raft/thirdparty/"], + "cpp/include/raft/thirdparty/", + "cpp/cmake/modules/FindAVX.cmake"], help=("Exclude the paths specified (regexp). " "Can be specified multiple times.")) diff --git a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh index ed3d2a15fd..efc8f0c77c 100755 --- a/ci/release/apply_wheel_modifications.sh +++ b/ci/release/apply_wheel_modifications.sh @@ -6,10 +6,6 @@ VERSION=${1} CUDA_SUFFIX=${2} -# __init__.py versions -sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/pylibraft/pylibraft/__init__.py -sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/raft-dask/raft_dask/__init__.py - # pyproject.toml versions sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/pylibraft/pyproject.toml sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/raft-dask/pyproject.toml diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 44e446d8f6..e32697a68a 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -26,7 +26,7 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - libraft-headers libraft-distance libraft-nn libraft-tests + libraft-headers libraft libraft-tests rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_python.sh b/ci/test_python.sh index 934c9c6951..cb6b7631e4 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -31,7 +31,7 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" 
\ - libraft-distance libraft-headers pylibraft raft-dask + libraft libraft-headers pylibraft raft-dask rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/wheel_smoke_test_raft_dask.py b/ci/wheel_smoke_test_raft_dask.py index 32c13e61ca..5709ac901c 100644 --- a/ci/wheel_smoke_test_raft_dask.py +++ b/ci/wheel_smoke_test_raft_dask.py @@ -1,4 +1,19 @@ -from dask.distributed import Client, wait +# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from dask.distributed import Client, get_worker, wait from dask_cuda import LocalCUDACluster, initialize from raft_dask.common import ( @@ -23,12 +38,12 @@ def func_test_send_recv(sessionId, n_trials): - handle = local_handle(sessionId) + handle = local_handle(sessionId, dask_worker=get_worker()) return perform_test_comms_send_recv(handle, n_trials) def func_test_collective(func, sessionId, root): - handle = local_handle(sessionId) + handle = local_handle(sessionId, dask_worker=get_worker()) return func(handle, root) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 9d447116a3..3706b31344 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -18,14 +18,14 @@ dependencies: - cupy - cxx-compiler - cython>=0.29,<0.30 -- dask-cuda=23.06 +- dask-cuda==23.6.* - dask>=2023.1.1 - distributed>=2023.1.1 - doxygen>=1.8.20 -- faiss-proc=*=cuda - gcc_linux-64=11.* - graphviz - ipython +- joblib>=0.11 - libcublas-dev=11.11.3.6 - libcublas=11.11.3.6 - libcurand-dev=10.3.0.86 @@ -34,14 +34,16 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libfaiss>=1.7.1=cuda* +- nccl>=2.9.9 - ninja +- numba>=0.49 +- numpy>=1.21 - numpydoc - pydata-sphinx-theme - pytest - pytest-cov - recommonmark -- rmm=23.06 +- rmm==23.6.* - scikit-build>=0.13.1 - scikit-learn - scipy @@ -49,6 +51,6 @@ dependencies: - sphinx-markdown-tables - sysroot_linux-64==2.17 - ucx-proc=*=gpu -- ucx-py=0.32.* +- ucx-py==0.32.* - ucx>=1.13.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml new file mode 100644 index 0000000000..5965aaef8f --- /dev/null +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -0,0 +1,37 @@ +# This file is generated by `rapids-dependency-file-generator`. 
+# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- c-compiler +- clang-tools=11.1.0 +- clang=11.1.0 +- cmake>=3.23.1,!=3.25.0 +- cuda-profiler-api=11.8.86 +- cudatoolkit=11.8 +- cxx-compiler +- cython>=0.29,<0.30 +- faiss-proc=*=cuda +- gcc_linux-64=11.* +- glog>=0.6.0 +- h5py>=3.8.0 +- hnswlib=0.7.0 +- libcublas-dev=11.11.3.6 +- libcublas=11.11.3.6 +- libcurand-dev=10.3.0.86 +- libcurand=10.3.0.86 +- libcusolver-dev=11.4.1.48 +- libcusolver=11.4.1.48 +- libcusparse-dev=11.7.5.86 +- libcusparse=11.7.5.86 +- libfaiss>=1.7.1 +- nccl>=2.9.9 +- ninja +- nlohmann_json>=3.11.2 +- scikit-build>=0.13.1 +- sysroot_linux-64==2.17 +name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/recipes/libraft/build_libraft_nn.sh b/conda/recipes/libraft/build_libraft.sh similarity index 54% rename from conda/recipes/libraft/build_libraft_nn.sh rename to conda/recipes/libraft/build_libraft.sh index 5347bfbc20..237e47eb26 100644 --- a/conda/recipes/libraft/build_libraft_nn.sh +++ b/conda/recipes/libraft/build_libraft.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash # Copyright (c) 2022-2023, NVIDIA CORPORATION. -./build.sh libraft --allgpuarch --compile-nn --no-nvtx +./build.sh libraft --allgpuarch --compile-lib --no-nvtx diff --git a/conda/recipes/libraft/build_libraft_distance.sh b/conda/recipes/libraft/build_libraft_distance.sh deleted file mode 100644 index 27a1ee43c8..0000000000 --- a/conda/recipes/libraft/build_libraft_distance.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
- -./build.sh libraft --allgpuarch --compile-dist --no-nvtx diff --git a/conda/recipes/libraft/build_libraft_nn_bench.sh b/conda/recipes/libraft/build_libraft_nn_bench.sh new file mode 100644 index 0000000000..dc6250f0f4 --- /dev/null +++ b/conda/recipes/libraft/build_libraft_nn_bench.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +./build.sh tests bench-ann --allgpuarch --no-nvtx +cmake --install cpp/build --component ann_bench diff --git a/conda/recipes/libraft/build_libraft_template.sh b/conda/recipes/libraft/build_libraft_template.sh new file mode 100644 index 0000000000..9759402884 --- /dev/null +++ b/conda/recipes/libraft/build_libraft_template.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +# Just building template so we verify it uses libraft.so and fail if it doesn't build +./build.sh template \ No newline at end of file diff --git a/conda/recipes/libraft/build_libraft_tests.sh b/conda/recipes/libraft/build_libraft_tests.sh index aa2c1b3e89..cc28f93fb8 100644 --- a/conda/recipes/libraft/build_libraft_tests.sh +++ b/conda/recipes/libraft/build_libraft_tests.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash # Copyright (c) 2022-2023, NVIDIA CORPORATION. -./build.sh tests bench --allgpuarch --no-nvtx +./build.sh tests bench-prims --allgpuarch --no-nvtx cmake --install cpp/build --component testing diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index ca213dc317..2a66f213a7 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -19,8 +19,17 @@ nccl_version: gtest_version: - "=1.10.0" -libfaiss_version: - - "1.7.2 *_cuda" +glog_version: + - ">=0.6.0" + +faiss_version: + - ">=1.7.1" + +h5py_version: + - ">=3.8.0" + +nlohmann_json_version: + - ">=3.11.2" # The CTK libraries below are missing from the conda-forge::cudatoolkit # package. 
The "*_host_*" version specifiers correspond to `11.8` packages and the diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml index 771c7d55b8..7859807777 100644 --- a/conda/recipes/libraft/meta.yaml +++ b/conda/recipes/libraft/meta.yaml @@ -76,9 +76,9 @@ outputs: home: https://rapids.ai/ license: Apache-2.0 summary: libraft-headers library - - name: libraft-distance + - name: libraft version: {{ version }} - script: build_libraft_distance.sh + script: build_libraft.sh build: script_env: *script_env number: {{ GIT_DESCRIBE_NUMBER }} @@ -109,10 +109,10 @@ outputs: about: home: https://rapids.ai/ license: Apache-2.0 - summary: libraft-distance library - - name: libraft-nn + summary: libraft library + - name: libraft-tests version: {{ version }} - script: build_libraft_nn.sh + script: build_libraft_tests.sh build: script_env: *script_env number: {{ GIT_DESCRIBE_NUMBER }} @@ -128,10 +128,11 @@ outputs: - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: + - {{ pin_subpackage('libraft', exact=True) }} - {{ pin_subpackage('libraft-headers', exact=True) }} - cuda-profiler-api {{ cuda_profiler_api_host_version }} - - faiss-proc=*=cuda - - lapack + - gmock {{ gtest_version }} + - gtest {{ gtest_version }} - libcublas {{ libcublas_host_version }} - libcublas-dev {{ libcublas_host_version }} - libcurand {{ libcurand_host_version }} @@ -140,18 +141,18 @@ outputs: - libcusolver-dev {{ libcusolver_host_version }} - libcusparse {{ libcusparse_host_version }} - libcusparse-dev {{ libcusparse_host_version }} - - libfaiss {{ libfaiss_version }} run: - - faiss-proc=*=cuda - - libfaiss {{ libfaiss_version }} + - {{ pin_subpackage('libraft', exact=True) }} - {{ pin_subpackage('libraft-headers', exact=True) }} + - gmock {{ gtest_version }} + - gtest {{ gtest_version }} about: home: https://rapids.ai/ license: Apache-2.0 - summary: libraft-nn library - - name: libraft-tests + summary: libraft tests + - name: libraft-template version: {{ version }} 
- script: build_libraft_tests.sh + script: build_libraft_template.sh build: script_env: *script_env number: {{ GIT_DESCRIBE_NUMBER }} @@ -167,12 +168,9 @@ outputs: - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: - - {{ pin_subpackage('libraft-distance', exact=True) }} + - {{ pin_subpackage('libraft', exact=True) }} - {{ pin_subpackage('libraft-headers', exact=True) }} - - {{ pin_subpackage('libraft-nn', exact=True) }} - cuda-profiler-api {{ cuda_profiler_api_host_version }} - - gmock {{ gtest_version }} - - gtest {{ gtest_version }} - libcublas {{ libcublas_host_version }} - libcublas-dev {{ libcublas_host_version }} - libcurand {{ libcurand_host_version }} @@ -182,12 +180,53 @@ outputs: - libcusparse {{ libcusparse_host_version }} - libcusparse-dev {{ libcusparse_host_version }} run: - - {{ pin_subpackage('libraft-distance', exact=True) }} + - {{ pin_subpackage('libraft', exact=True) }} - {{ pin_subpackage('libraft-headers', exact=True) }} - - {{ pin_subpackage('libraft-nn', exact=True) }} - - gmock {{ gtest_version }} - - gtest {{ gtest_version }} about: home: https://rapids.ai/ license: Apache-2.0 - summary: libraft tests + summary: libraft template + - name: libraft-ann-bench + version: {{ version }} + script: build_libraft_nn_bench.sh + build: + script_env: *script_env + number: {{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + ignore_run_exports_from: + - {{ compiler('cuda') }} + requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cuda') }} {{ cuda_version }} + - {{ compiler('cxx') }} + - cmake {{ cmake_version }} + - ninja + - sysroot_{{ target_platform }} {{ sysroot_version }} + host: + - {{ pin_subpackage('libraft', exact=True) }} + - {{ pin_subpackage('libraft-headers', exact=True) }} + - cuda-profiler-api {{ cuda_profiler_api_host_version }} + - libcublas {{ libcublas_host_version }} + - libcublas-dev {{ libcublas_host_version }} + - libcurand {{ 
libcurand_host_version }} + - libcurand-dev {{ libcurand_host_version }} + - libcusolver {{ libcusolver_host_version }} + - libcusolver-dev {{ libcusolver_host_version }} + - libcusparse {{ libcusparse_host_version }} + - libcusparse-dev {{ libcusparse_host_version }} + - glog {{ glog_version }} + - nlohmann_json {{ nlohmann_json_version }} + - libfaiss>=1.7.1 + - faiss-proc=*=cuda + run: + - {{ pin_subpackage('libraft', exact=True) }} + - {{ pin_subpackage('libraft-headers', exact=True) }} + - glog {{ glog_version }} + - faiss-proc=*=cuda + - libfaiss {{ faiss_version }} + - h5py {{ h5py_version }} + about: + home: https://rapids.ai/ + license: Apache-2.0 + summary: libraft ann bench diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index 4a9b98ac75..7730801801 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -34,8 +34,9 @@ requirements: - cuda-python >=11.7.1,<12.0 - cudatoolkit ={{ cuda_version }} - cython >=0.29,<0.30 - - libraft-distance {{ version }} + - libraft {{ version }} - libraft-headers {{ version }} + - numpy >=1.21 - python x.x - rmm ={{ minor_version }} - scikit-build >=0.13.1 @@ -43,7 +44,7 @@ requirements: run: - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} - cuda-python >=11.7.1,<12.0 - - libraft-distance {{ version }} + - libraft {{ version }} - libraft-headers {{ version }} - python x.x diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index ef22522116..778b187870 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -11,7 +11,7 @@ sysroot_version: - "2.17" ucx_version: - - "1.13.0" + - ">=1.13.0,<1.15.0" ucx_py_version: - "0.31.*" diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml index b387f0f47c..59a67fe148 100644 --- a/conda/recipes/raft-dask/meta.yaml +++ b/conda/recipes/raft-dask/meta.yaml @@ 
-54,7 +54,7 @@ requirements: - pylibraft {{ version }} - python x.x - rmm ={{ minor_version }} - - ucx >={{ ucx_version }} + - ucx {{ ucx_version }} - ucx-proc=*=gpu - ucx-py {{ ucx_py_version }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 840321c3fa..87b3e7269b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -46,63 +46,47 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) option(BUILD_SHARED_LIBS "Build raft shared libraries" ON) option(BUILD_TESTS "Build raft unit-tests" ON) -option(BUILD_BENCH "Build raft C++ benchmark tests" OFF) +option(BUILD_PRIMS_BENCH "Build raft C++ benchmark tests" OFF) +option(BUILD_ANN_BENCH "Build raft ann benchmarks" OFF) option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF) option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF ) option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and libraries" OFF) +option(CUDA_LOG_COMPILE_TIME "Write a log of compilation times to nvcc_compile_log.csv" OFF) option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON) option(DISABLE_DEPRECATION_WARNINGS "Disable deprecaction warnings " ON) option(DISABLE_OPENMP "Disable OpenMP" OFF) option(RAFT_NVTX "Enable nvtx markers" OFF) -set(RAFT_COMPILE_LIBRARIES_DEFAULT OFF) -if(BUILD_TESTS OR BUILD_BENCH) - set(RAFT_COMPILE_LIBRARIES_DEFAULT ON) -endif() -option(RAFT_COMPILE_LIBRARIES "Enable building raft shared library instantiations" - ${RAFT_COMPILE_LIBRARIES_DEFAULT} -) -option( - RAFT_COMPILE_NN_LIBRARY "Enable building raft nearest neighbors shared library instantiations" - ${RAFT_COMPILE_LIBRARIES} -) -option(RAFT_COMPILE_DIST_LIBRARY "Enable building raft distant shared library instantiations" - ${RAFT_COMPILE_LIBRARIES} +set(RAFT_COMPILE_LIBRARY_DEFAULT OFF) +if(BUILD_TESTS + OR BUILD_PRIMS_BENCH + OR BUILD_ANN_BENCH ) -option(RAFT_ENABLE_NN_DEPENDENCIES "Search for raft::nn dependencies like faiss" - 
${RAFT_COMPILE_NN_LIBRARY} + set(RAFT_COMPILE_LIBRARY_DEFAULT ON) +endif() +option(RAFT_COMPILE_LIBRARY "Enable building raft shared library instantiations" + ${RAFT_COMPILE_LIBRARY_DEFAULT} ) -option(RAFT_ENABLE_thrust_DEPENDENCY "Enable Thrust dependency" ON) - -if(BUILD_TESTS OR BUILD_BENCH) +if(BUILD_TESTS + OR BUILD_PRIMS_BENCH + OR BUILD_ANN_BENCH +) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs # to have different values for the `Threads::Threads` target. Setting this flag ensures # `Threads::Threads` is the same value in first run and subsequent runs. set(THREADS_PREFER_PTHREAD_FLAG ON) endif() -if(BUILD_TESTS AND NOT RAFT_ENABLE_thrust_DEPENDENCY) - message(VERBOSE "RAFT: BUILD_TESTS is enabled, overriding RAFT_ENABLE_thrust_DEPENDENCY") - set(RAFT_ENABLE_thrust_DEPENDENCY ON) -endif() - -option(RAFT_EXCLUDE_FAISS_FROM_ALL "Exclude FAISS targets from RAFT's 'all' target" ON) - include(CMakeDependentOption) -cmake_dependent_option( - RAFT_USE_FAISS_STATIC - "Build and statically link the FAISS library for nearest neighbors search on GPU" - ON - RAFT_COMPILE_LIBRARIES - OFF -) +# cmake_dependent_option( RAFT_USE_FAISS_STATIC "Build and statically link the FAISS library for +# nearest neighbors search on GPU" ON RAFT_COMPILE_LIBRARY OFF ) message(VERBOSE "RAFT: Building optional components: ${raft_FIND_COMPONENTS}") message(VERBOSE "RAFT: Build RAFT unit-tests: ${BUILD_TESTS}") -message(VERBOSE "RAFT: Building raft C++ benchmarks: ${BUILD_BENCH}") +message(VERBOSE "RAFT: Building raft C++ benchmarks: ${BUILD_PRIMS_BENCH}") message(VERBOSE "RAFT: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}") message(VERBOSE "RAFT: Disable depreaction warnings " ${DISABLE_DEPRECATION_WARNINGS}) message(VERBOSE "RAFT: Disable OpenMP: ${DISABLE_OPENMP}") @@ -168,36 +152,22 @@ include(cmake/modules/ConfigureCUDA.cmake) # 
################################################################################################## # * Requirements ------------------------------------------------------------- -if(RAFT_COMPILE_LIBRARIES) - set(RAFT_COMPILE_DIST_LIBRARY ON) - set(RAFT_COMPILE_NN_LIBRARY ON) -endif() - -if(RAFT_COMPILE_DIST_LIBRARY OR distance IN_LIST raft_FIND_COMPONENTS) - set(RAFT_ENABLE_cuco_DEPENDENCY ON) -endif() - # add third party dependencies using CPM rapids_cpm_init() # thrust before rmm/cuco so we get the right version of thrust/cub include(cmake/thirdparty/get_thrust.cmake) include(cmake/thirdparty/get_rmm.cmake) -include(cmake/thirdparty/get_faiss.cmake) include(cmake/thirdparty/get_cutlass.cmake) -if(RAFT_ENABLE_cuco_DEPENDENCY) - include(${rapids-cmake-dir}/cpm/cuco.cmake) - rapids_cpm_cuco( - BUILD_EXPORT_SET raft-distance-lib-exports INSTALL_EXPORT_SET raft-distance-lib-exports - ) -endif() +include(${rapids-cmake-dir}/cpm/cuco.cmake) +rapids_cpm_cuco(BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports) if(BUILD_TESTS) include(cmake/thirdparty/get_gtest.cmake) endif() -if(BUILD_BENCH) +if(BUILD_PRIMS_BENCH) include(${rapids-cmake-dir}/cpm/gbench.cmake) rapids_cpm_gbench() endif() @@ -215,11 +185,13 @@ target_include_directories( target_link_libraries( raft INTERFACE rmm::rmm + cuco::cuco + nvidia::cutlass::cutlass CUDA::cublas${_ctk_static_suffix} CUDA::curand${_ctk_static_suffix} CUDA::cusolver${_ctk_static_suffix} CUDA::cusparse${_ctk_static_suffix} - $<$<BOOL:${RAFT_ENABLE_thrust_DEPENDENCY}>:raft::Thrust> + raft::Thrust ) target_compile_features(raft INTERFACE cxx_std_17 $<BUILD_INTERFACE:cuda_std_17>) @@ -237,7 +209,7 @@ else() target_compile_definitions(raft INTERFACE RAFT_SYSTEM_LITTLE_ENDIAN=1) endif() -if(RAFT_COMPILE_DIST_LIBRARY OR RAFT_COMPILE_NN_LIBRARY) +if(RAFT_COMPILE_LIBRARY) file( WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld" [=[ @@ -281,148 +253,200 @@ target_compile_definitions(raft::raft INTERFACE $<$<BOOL:${RAFT_NVTX}>:NVTX_ENAB
################################################################################################## -# * raft_distance ------------------------------------------------------------ TODO: Currently, this +# * raft_compiled ------------------------------------------------------------ TODO: Currently, this # package also contains the 'random' namespace (for rmat logic) We couldn't get this to work # properly due to strange CI failures as noticed in the PR#778. In the long term, we should rename # this package to `raft_compiled` in order to have a single pre-compiled raft package for those # who need it. -add_library(raft_distance INTERFACE) +add_library(raft_compiled INTERFACE) -if(TARGET raft_distance AND (NOT TARGET raft::distance)) - add_library(raft::distance ALIAS raft_distance) +if(TARGET raft_compiled AND (NOT TARGET raft::compiled)) + add_library(raft::compiled ALIAS raft_compiled) endif() -set_target_properties(raft_distance PROPERTIES EXPORT_NAME distance) +set_target_properties(raft_compiled PROPERTIES EXPORT_NAME compiled) -if(RAFT_COMPILE_DIST_LIBRARY) +if(RAFT_COMPILE_LIBRARY) add_library( - raft_distance_lib - src/distance/distance/pairwise_distance.cu - src/distance/distance/fused_l2_min_arg.cu - src/distance/cluster/update_centroids_float.cu - src/distance/cluster/update_centroids_double.cu - src/distance/cluster/cluster_cost_float.cu - src/distance/cluster/cluster_cost_double.cu - src/distance/neighbors/refine_d_int64_t_float.cu - src/distance/neighbors/refine_d_int64_t_int8_t.cu - src/distance/neighbors/refine_d_int64_t_uint8_t.cu - src/distance/neighbors/refine_h_int64_t_float.cu - src/distance/neighbors/refine_h_int64_t_int8_t.cu - src/distance/neighbors/refine_h_int64_t_uint8_t.cu - src/distance/neighbors/specializations/refine_d_int64_t_float.cu - src/distance/neighbors/specializations/refine_d_int64_t_int8_t.cu - src/distance/neighbors/specializations/refine_d_int64_t_uint8_t.cu - src/distance/neighbors/specializations/refine_h_int64_t_float.cu - 
src/distance/neighbors/specializations/refine_h_int64_t_int8_t.cu - src/distance/neighbors/specializations/refine_h_int64_t_uint8_t.cu - src/distance/cluster/kmeans_fit_float.cu - src/distance/cluster/kmeans_fit_double.cu - src/distance/cluster/kmeans_init_plus_plus_double.cu - src/distance/cluster/kmeans_init_plus_plus_float.cu - src/distance/distance/specializations/detail/canberra_double_double_double_int.cu - src/distance/distance/specializations/detail/canberra_float_float_float_int.cu - src/distance/distance/specializations/detail/correlation_double_double_double_int.cu - src/distance/distance/specializations/detail/correlation_float_float_float_int.cu - src/distance/distance/specializations/detail/cosine_double_double_double_int.cu - src/distance/distance/specializations/detail/cosine_float_float_float_int.cu - src/distance/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu - src/distance/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu - src/distance/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu - src/distance/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu - src/distance/distance/specializations/detail/inner_product_float_float_float_int.cu - src/distance/distance/specializations/detail/inner_product_double_double_double_int.cu - src/distance/distance/specializations/detail/jensen_shannon_float_float_float_int.cu - src/distance/distance/specializations/detail/jensen_shannon_double_double_double_int.cu - src/distance/distance/specializations/detail/kernels/gram_matrix_base_double.cu - src/distance/distance/specializations/detail/kernels/gram_matrix_base_float.cu - src/distance/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu - src/distance/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu + raft_lib + src/distance/pairwise_distance.cu + src/distance/fused_l2_min_arg.cu + 
src/cluster/update_centroids_float.cu + src/cluster/update_centroids_double.cu + src/cluster/cluster_cost_float.cu + src/cluster/cluster_cost_double.cu + src/neighbors/refine_d_int64_t_float.cu + src/neighbors/refine_d_int64_t_int8_t.cu + src/neighbors/refine_d_int64_t_uint8_t.cu + src/neighbors/refine_h_int64_t_float.cu + src/neighbors/refine_h_int64_t_int8_t.cu + src/neighbors/refine_h_int64_t_uint8_t.cu + src/neighbors/specializations/refine_d_int64_t_float.cu + src/neighbors/specializations/refine_d_int64_t_int8_t.cu + src/neighbors/specializations/refine_d_int64_t_uint8_t.cu + src/neighbors/specializations/refine_h_int64_t_float.cu + src/neighbors/specializations/refine_h_int64_t_int8_t.cu + src/neighbors/specializations/refine_h_int64_t_uint8_t.cu + src/cluster/kmeans_fit_float.cu + src/cluster/kmeans_fit_double.cu + src/cluster/kmeans_init_plus_plus_double.cu + src/cluster/kmeans_init_plus_plus_float.cu + src/distance/specializations/detail/canberra_double_double_double_int.cu + src/distance/specializations/detail/canberra_float_float_float_int.cu + src/distance/specializations/detail/correlation_double_double_double_int.cu + src/distance/specializations/detail/correlation_float_float_float_int.cu + src/distance/specializations/detail/cosine_double_double_double_int.cu + src/distance/specializations/detail/cosine_float_float_float_int.cu + src/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu + src/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu + src/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu + src/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu + src/distance/specializations/detail/inner_product_float_float_float_int.cu + src/distance/specializations/detail/inner_product_double_double_double_int.cu + src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu + 
src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu + src/distance/specializations/detail/kernels/gram_matrix_base_double.cu + src/distance/specializations/detail/kernels/gram_matrix_base_float.cu + src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu + src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu # These are somehow missing a kernel definition which is causing a compile error. # src/distance/specializations/detail/kernels/rbf_kernel_double.cu # src/distance/specializations/detail/kernels/rbf_kernel_float.cu - src/distance/distance/specializations/detail/kernels/tanh_kernel_double.cu - src/distance/distance/specializations/detail/kernels/tanh_kernel_float.cu - src/distance/distance/specializations/detail/kl_divergence_float_float_float_int.cu - src/distance/distance/specializations/detail/kl_divergence_double_double_double_int.cu - src/distance/distance/specializations/detail/l1_float_float_float_int.cu - src/distance/distance/specializations/detail/l1_double_double_double_int.cu - src/distance/distance/specializations/detail/l2_expanded_float_float_float_int.cu - src/distance/distance/specializations/detail/l2_expanded_double_double_double_int.cu - src/distance/distance/specializations/detail/l2_sqrt_expanded_float_float_float_int.cu - src/distance/distance/specializations/detail/l2_sqrt_expanded_double_double_double_int.cu - src/distance/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_int.cu - src/distance/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu - src/distance/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu - src/distance/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu - src/distance/distance/specializations/detail/l_inf_double_double_double_int.cu - src/distance/distance/specializations/detail/l_inf_float_float_float_int.cu - 
src/distance/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu - src/distance/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu - src/distance/distance/specializations/detail/russel_rao_double_double_double_int.cu - src/distance/distance/specializations/detail/russel_rao_float_float_float_int.cu - src/distance/distance/specializations/fused_l2_nn_double_int.cu - src/distance/distance/specializations/fused_l2_nn_double_int64.cu - src/distance/distance/specializations/fused_l2_nn_float_int.cu - src/distance/distance/specializations/fused_l2_nn_float_int64.cu - src/distance/matrix/specializations/detail/select_k_float_uint32_t.cu - src/distance/matrix/specializations/detail/select_k_float_int64_t.cu - src/distance/matrix/specializations/detail/select_k_half_uint32_t.cu - src/distance/matrix/specializations/detail/select_k_half_int64_t.cu - src/distance/neighbors/ivf_flat_search.cu - src/distance/neighbors/ivf_flat_build.cu - src/distance/neighbors/specializations/ivfflat_build_float_int64_t.cu - src/distance/neighbors/specializations/ivfflat_build_int8_t_int64_t.cu - src/distance/neighbors/specializations/ivfflat_build_uint8_t_int64_t.cu - src/distance/neighbors/specializations/ivfflat_extend_float_int64_t.cu - src/distance/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu - src/distance/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu - src/distance/neighbors/specializations/ivfflat_search_float_int64_t.cu - src/distance/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu - src/distance/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu - src/distance/neighbors/ivfpq_build.cu - src/distance/neighbors/ivfpq_deserialize.cu - src/distance/neighbors/ivfpq_serialize.cu - src/distance/neighbors/ivfpq_search_float_int64_t.cu - src/distance/neighbors/ivfpq_search_int8_t_int64_t.cu - src/distance/neighbors/ivfpq_search_uint8_t_int64_t.cu - 
src/distance/neighbors/specializations/ivfpq_build_float_int64_t.cu - src/distance/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu - src/distance/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu - src/distance/neighbors/specializations/ivfpq_extend_float_int64_t.cu - src/distance/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu - src/distance/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu - src/distance/neighbors/specializations/ivfpq_search_float_int64_t.cu - src/distance/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu - src/distance/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu - src/distance/neighbors/specializations/detail/compute_similarity_float_float_fast.cu - src/distance/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu - src/distance/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu - src/distance/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu - src/distance/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu - src/distance/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu - src/distance/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu - src/distance/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu - src/distance/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu - src/distance/neighbors/specializations/detail/compute_similarity_float_half_fast.cu - src/distance/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu - src/distance/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu - src/distance/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu - src/distance/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu - 
src/distance/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu - src/distance/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu - src/distance/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu - src/distance/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu - src/distance/neighbors/specializations/detail/compute_similarity_half_half_fast.cu - src/distance/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu - src/distance/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu - src/distance/random/rmat_rectangular_generator_int_double.cu - src/distance/random/rmat_rectangular_generator_int64_double.cu - src/distance/random/rmat_rectangular_generator_int_float.cu - src/distance/random/rmat_rectangular_generator_int64_float.cu + src/neighbors/brute_force_knn_int64_t_float.cu + src/distance/specializations/detail/kernels/tanh_kernel_double.cu + src/distance/specializations/detail/kernels/tanh_kernel_float.cu + src/distance/specializations/detail/kl_divergence_float_float_float_int.cu + src/distance/specializations/detail/kl_divergence_double_double_double_int.cu + src/distance/specializations/detail/l1_float_float_float_int.cu + src/distance/specializations/detail/l1_double_double_double_int.cu + src/distance/specializations/detail/l2_expanded_float_float_float_int.cu + src/distance/specializations/detail/l2_expanded_double_double_double_int.cu + src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu + src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu + src/distance/specializations/detail/l_inf_double_double_double_int.cu + src/distance/specializations/detail/l_inf_float_float_float_int.cu + src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu + src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu + 
src/distance/specializations/detail/russel_rao_double_double_double_int.cu + src/distance/specializations/detail/russel_rao_float_float_float_int.cu + src/distance/specializations/fused_l2_nn_double_int.cu + src/distance/specializations/fused_l2_nn_double_int64.cu + src/distance/specializations/fused_l2_nn_float_int.cu + src/distance/specializations/fused_l2_nn_float_int64.cu + src/matrix/specializations/detail/select_k_float_uint32_t.cu + src/matrix/specializations/detail/select_k_float_int64_t.cu + src/matrix/specializations/detail/select_k_half_uint32_t.cu + src/matrix/specializations/detail/select_k_half_int64_t.cu + src/neighbors/ivfpq_build.cu + src/neighbors/ivfpq_deserialize.cu + src/neighbors/ivfpq_serialize.cu + src/neighbors/ivfpq_search_float_int64_t.cu + src/neighbors/ivfpq_search_int8_t_int64_t.cu + src/neighbors/ivfpq_search_uint8_t_int64_t.cu + src/neighbors/specializations/ivfpq_build_float_int64_t.cu + src/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu + src/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu + src/neighbors/specializations/ivfpq_extend_float_int64_t.cu + src/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu + src/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu + src/neighbors/specializations/ivfpq_search_float_int64_t.cu + src/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu + src/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu + src/neighbors/specializations/detail/brute_force_knn_impl_long_float_int.cu + src/neighbors/specializations/detail/brute_force_knn_impl_long_float_uint.cu + src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_int.cu + src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_uint.cu + src/neighbors/specializations/detail/compute_similarity_float_float_fast.cu + src/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu + src/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu + 
src/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu + src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu + src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu + src/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu + src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu + src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu + src/neighbors/specializations/detail/compute_similarity_float_half_fast.cu + src/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu + src/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu + src/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu + src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu + src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu + src/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu + src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu + src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu + src/neighbors/specializations/detail/compute_similarity_half_half_fast.cu + src/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu + src/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu + src/random/rmat_rectangular_generator_int_double.cu + src/random/rmat_rectangular_generator_int64_double.cu + src/random/rmat_rectangular_generator_int_float.cu + src/random/rmat_rectangular_generator_int64_float.cu + src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_2d.cu + src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_2d.cu + src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_3d.cu + src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_3d.cu + 
src/neighbors/specializations/ball_cover_all_knn_query.cu + src/neighbors/specializations/ball_cover_build_index.cu + src/neighbors/specializations/ball_cover_knn_query.cu + src/neighbors/specializations/fused_l2_knn_long_float_true.cu + src/neighbors/specializations/fused_l2_knn_long_float_false.cu + src/neighbors/specializations/fused_l2_knn_int_float_true.cu + src/neighbors/specializations/fused_l2_knn_int_float_false.cu + src/neighbors/ivf_flat_search.cu + src/neighbors/ivf_flat_build.cu + src/neighbors/specializations/ivfflat_build_float_int64_t.cu + src/neighbors/specializations/ivfflat_build_int8_t_int64_t.cu + src/neighbors/specializations/ivfflat_build_uint8_t_int64_t.cu + src/neighbors/specializations/ivfflat_extend_float_int64_t.cu + src/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu + src/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu + src/neighbors/specializations/ivfflat_search_float_int64_t.cu + src/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu + src/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu + src/neighbors/ivfpq_build.cu + src/neighbors/ivfpq_deserialize.cu + src/neighbors/ivfpq_serialize.cu + src/neighbors/ivfpq_search_float_int64_t.cu + src/neighbors/ivfpq_search_int8_t_int64_t.cu + src/neighbors/ivfpq_search_uint8_t_int64_t.cu + src/neighbors/specializations/ivfpq_build_float_int64_t.cu + src/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu + src/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu + src/neighbors/specializations/ivfpq_extend_float_int64_t.cu + src/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu + src/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu + src/neighbors/specializations/ivfpq_search_float_int64_t.cu + src/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu + src/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu + src/neighbors/specializations/detail/compute_similarity_float_float_fast.cu + 
src/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu + src/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu + src/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu + src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu + src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu + src/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu + src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu + src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu + src/neighbors/specializations/detail/compute_similarity_float_half_fast.cu + src/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu + src/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu + src/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu + src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu + src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu + src/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu + src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu + src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu + src/neighbors/specializations/detail/compute_similarity_half_half_fast.cu + src/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu + src/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu + src/random/rmat_rectangular_generator_int_double.cu + src/random/rmat_rectangular_generator_int64_double.cu + src/random/rmat_rectangular_generator_int_float.cu + src/random/rmat_rectangular_generator_int64_float.cu ) set_target_properties( - raft_distance_lib - PROPERTIES OUTPUT_NAME raft_distance + raft_lib + PROPERTIES OUTPUT_NAME raft BUILD_RPATH "\$ORIGIN" 
INSTALL_RPATH "\$ORIGIN" CXX_STANDARD 17 @@ -433,95 +457,23 @@ if(RAFT_COMPILE_DIST_LIBRARY) INTERFACE_POSITION_INDEPENDENT_CODE ON ) - target_link_libraries( - raft_distance_lib - PUBLIC raft::raft cuco::cuco - PRIVATE nvidia::cutlass::cutlass $ - ) + target_link_libraries(raft_lib PUBLIC raft::raft $) target_compile_options( - raft_distance_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" - "$<$:${RAFT_CUDA_FLAGS}>" - ) - target_compile_definitions(raft_distance_lib INTERFACE "RAFT_DISTANCE_COMPILED") - - # ensure CUDA symbols aren't relocated to the middle of the debug build binaries - target_link_options(raft_distance_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") - -endif() - -if(TARGET raft_distance_lib AND (NOT TARGET raft::raft_distance_lib)) - add_library(raft::raft_distance_lib ALIAS raft_distance_lib) -endif() - -target_link_libraries( - raft_distance INTERFACE raft::raft $ - nvidia::cutlass::cutlass -) - -# ################################################################################################## -# * raft_nn ------------------------------------------------------------------ -add_library(raft_nn INTERFACE) - -if(TARGET raft_nn AND (NOT TARGET raft::nn)) - add_library(raft::nn ALIAS raft_nn) -endif() - -set_target_properties(raft_nn PROPERTIES EXPORT_NAME nn) - -if(RAFT_COMPILE_NN_LIBRARY) - add_library( - raft_nn_lib - src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu - src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu - src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu - src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu - src/nn/specializations/ball_cover_all_knn_query.cu - src/nn/specializations/ball_cover_build_index.cu - src/nn/specializations/ball_cover_knn_query.cu - src/nn/specializations/fused_l2_knn_long_float_true.cu - src/nn/specializations/fused_l2_knn_long_float_false.cu - src/nn/specializations/fused_l2_knn_int_float_true.cu - src/nn/specializations/fused_l2_knn_int_float_false.cu - 
src/nn/specializations/brute_force_knn_long_float_int.cu - src/nn/specializations/brute_force_knn_long_float_uint.cu - src/nn/specializations/brute_force_knn_uint32_t_float_int.cu - src/nn/specializations/brute_force_knn_uint32_t_float_uint.cu - ) - set_target_properties( - raft_nn_lib - PROPERTIES OUTPUT_NAME raft_nn - BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON + raft_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" ) + target_compile_definitions(raft_lib INTERFACE "RAFT_COMPILED") - target_link_libraries( - raft_nn_lib - PUBLIC faiss::faiss raft::raft - PRIVATE nvidia::cutlass::cutlass - ) - target_compile_options( - raft_nn_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" - "$<$:${RAFT_CUDA_FLAGS}>" - ) # ensure CUDA symbols aren't relocated to the middle of the debug build binaries - target_link_options(raft_nn_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") + target_link_options(raft_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") - target_compile_definitions(raft_nn_lib INTERFACE "RAFT_NN_COMPILED") endif() -if(TARGET raft_nn_lib AND (NOT TARGET raft::raft_nn_lib)) - add_library(raft::raft_nn_lib ALIAS raft_nn_lib) +if(TARGET raft_lib AND (NOT TARGET raft::raft_lib)) + add_library(raft::raft_lib ALIAS raft_lib) endif() -target_link_libraries( - raft_nn INTERFACE raft::raft $ nvidia::cutlass::cutlass -) +target_link_libraries(raft_compiled INTERFACE raft::raft $) # ################################################################################################## # * raft_distributed ------------------------------------------------------------------------------- @@ -562,39 +514,23 @@ install( ) install( - TARGETS raft_distance - DESTINATION ${lib_dir} - COMPONENT raft - EXPORT raft-distance-exports -) - -install( - TARGETS raft_nn + TARGETS raft_compiled DESTINATION ${lib_dir} 
COMPONENT raft - EXPORT raft-nn-exports + EXPORT raft-compiled-exports ) -if(TARGET raft_distance_lib) +if(TARGET raft_lib) install( - TARGETS raft_distance_lib + TARGETS raft_lib DESTINATION ${lib_dir} - COMPONENT distance - EXPORT raft-distance-lib-exports + COMPONENT compiled + EXPORT raft-compiled-lib-exports ) install( DIRECTORY include/raft_runtime DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} - COMPONENT distance - ) -endif() - -if(TARGET raft_nn_lib) - install( - TARGETS raft_nn_lib - DESTINATION ${lib_dir} - COMPONENT nn - EXPORT raft-nn-lib-exports + COMPONENT compiled ) endif() @@ -629,15 +565,11 @@ install( include("${rapids-cmake-dir}/export/write_dependencies.cmake") -set(raft_components distance nn distributed) -set(raft_install_comp raft raft raft) -if(TARGET raft_distance_lib) - list(APPEND raft_components distance-lib) - list(APPEND raft_install_comp distance) -endif() -if(TARGET raft_nn_lib) - list(APPEND raft_components nn-lib) - list(APPEND raft_install_comp nn) +set(raft_components compiled distributed) +set(raft_install_comp raft raft) +if(TARGET raft_lib) + list(APPEND raft_components compiled-lib) + list(APPEND raft_install_comp compiled) endif() foreach(comp install_comp IN ZIP_LISTS raft_components raft_install_comp) @@ -673,14 +605,12 @@ RAFT contains fundamental widely-used algorithms and primitives for data science and machine learning. 
Optional Components: - - nn - - distance + - compiled - distributed Imported Targets: - raft::raft - - raft::nn brought in by the `nn` optional component - - raft::distance brought in by the `distance` optional component + - raft::compiled brought in by the `compiled` optional component - raft::distributed brought in by the `distributed` optional component ]=] @@ -688,34 +618,22 @@ Imported Targets: set(code_string ${nvtx_export_string}) -if(RAFT_ENABLE_thrust_DEPENDENCY) - string( - APPEND - code_string - [=[ - if(NOT TARGET raft::Thrust) - thrust_create_target(raft::Thrust FROM_OPTIONS) - endif() - ]=] - ) -endif() - string( APPEND code_string [=[ -if(distance IN_LIST raft_FIND_COMPONENTS) - enable_language(CUDA) +if(NOT TARGET raft::Thrust) + thrust_create_target(raft::Thrust FROM_OPTIONS) endif() +]=] +) -if(nn IN_LIST raft_FIND_COMPONENTS) +string( + APPEND + code_string + [=[ +if(compiled IN_LIST raft_FIND_COMPONENTS) enable_language(CUDA) - - if(TARGET faiss AND (NOT TARGET faiss::faiss)) - add_library(faiss::faiss ALIAS faiss) - elseif(TARGET faiss::faiss AND (NOT TARGET faiss)) - add_library(faiss ALIAS faiss::faiss) - endif() endif() ]=] ) @@ -723,21 +641,21 @@ endif() # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) raft_export( - INSTALL raft COMPONENTS nn distance distributed EXPORT_SET raft-exports GLOBAL_TARGETS raft nn - distance distributed NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string + INSTALL raft COMPONENTS compiled distributed EXPORT_SET raft-exports GLOBAL_TARGETS raft compiled + distributed NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string ) # ################################################################################################## # * build export ------------------------------------------------------------- raft_export( - BUILD raft EXPORT_SET raft-exports COMPONENTS nn distance distributed GLOBAL_TARGETS raft - distance 
distributed nn DOCUMENTATION doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string + BUILD raft EXPORT_SET raft-exports COMPONENTS compiled distributed GLOBAL_TARGETS raft compiled + distributed DOCUMENTATION doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string ) # ################################################################################################## # * shared test/bench headers ------------------------------------------------ -if(BUILD_TESTS OR BUILD_BENCH) +if(BUILD_TESTS OR BUILD_PRIMS_BENCH) include(internal/CMakeLists.txt) endif() @@ -751,6 +669,13 @@ endif() # ################################################################################################## # * build benchmark executable ----------------------------------------------- -if(BUILD_BENCH) - include(bench/CMakeLists.txt) +if(BUILD_PRIMS_BENCH) + include(bench/prims/CMakeLists.txt) +endif() + +# ################################################################################################## +# * build ann benchmark executable ----------------------------------------------- + +if(BUILD_ANN_BENCH) + include(bench/ann/CMakeLists.txt) endif() diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index e2324de654..d92ccba8e3 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -17,7 +17,7 @@ function(ConfigureBench) - set(options OPTIONAL DIST NN) + set(options OPTIONAL LIB) set(oneValueArgs NAME) set(multiValueArgs PATH TARGETS CONFIGURATIONS) @@ -31,8 +31,7 @@ function(ConfigureBench) ${BENCH_NAME} PRIVATE raft::raft raft_internal - $<$:raft::distance> - $<$:raft::nn> + $<$:raft::compiled> benchmark::benchmark Threads::Threads $ @@ -70,7 +69,12 @@ endfunction() if(BUILD_BENCH) ConfigureBench( NAME CLUSTER_BENCH PATH bench/cluster/kmeans_balanced.cu bench/cluster/kmeans.cu bench/main.cpp - OPTIONAL DIST NN + OPTIONAL LIB + ) + + ConfigureBench( + NAME TUNE_DISTANCE PATH bench/distance/tune_pairwise/kernel.cu + bench/distance/tune_pairwise/bench.cu 
bench/main.cpp ) ConfigureBench( @@ -86,7 +90,7 @@ if(BUILD_BENCH) bench/distance/kernels.cu bench/main.cpp OPTIONAL - DIST + LIB ) ConfigureBench( @@ -106,7 +110,7 @@ if(BUILD_BENCH) ConfigureBench( NAME MATRIX_BENCH PATH bench/matrix/argmin.cu bench/matrix/gather.cu bench/matrix/select_k.cu - bench/main.cpp OPTIONAL DIST + bench/main.cpp OPTIONAL LIB ) ConfigureBench( @@ -132,7 +136,6 @@ if(BUILD_BENCH) bench/neighbors/refine_uint8_t_int64_t.cu bench/main.cpp OPTIONAL - DIST - NN + LIB ) endif() diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt new file mode 100644 index 0000000000..6267be518e --- /dev/null +++ b/cpp/bench/ann/CMakeLists.txt @@ -0,0 +1,160 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +# ################################################################################################## +# * compiler function ----------------------------------------------------------------------------- + +option(RAFT_ANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_BFKNN "Include raft's brute-force knn algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" ON) + +find_package(Threads REQUIRED) + +set(RAFT_ANN_BENCH_USE_FAISS OFF) +if(RAFT_ANN_BENCH_USE_FAISS_BFKNN + OR RAFT_ANN_BENCH_USE_FAISS_IVFPQ + OR RAFT_ANN_BENCH_USE_FAISS_IFFLAT +) + set(RAFT_ANN_BENCH_USE_FAISS ON) +endif() + +set(RAFT_ANN_BENCH_USE_RAFT OFF) +if(RAFT_ANN_BENCH_USE_RAFT_BFKNN + OR RAFT_ANN_BENCH_USE_RAFT_IVFPQ + OR RAFT_ANN_BENCH_USE_RAFT_IVFFLAT +) + set(RAFT_ANN_BENCH_USE_RAFT ON) +endif() + +if(RAFT_ANN_BENCH_USE_HNSWLIB) + include(cmake/thirdparty/get_hnswlib.cmake) +endif() + +option(RAFT_ANN_BENCH_USE_MULTIGPU "Use multi-gpus (where possible) in benchmarks" OFF) + +include(cmake/thirdparty/get_nlohmann_json.cmake) + +if(RAFT_ANN_BENCH_USE_GGNN) + include(cmake/thirdparty/get_ggnn.cmake) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS) + include(cmake/thirdparty/get_faiss.cmake) +endif() + +function(ConfigureAnnBench) + + set(oneValueArgs NAME) + set(multiValueArgs PATH LINKS CXXFLAGS INCLUDES) + + cmake_parse_arguments( + ConfigureAnnBench "${options}" 
"${oneValueArgs}" "${multiValueArgs}" ${ARGN} + ) + + set(BENCH_NAME ${ConfigureAnnBench_NAME}_ANN_BENCH) + + add_executable( + ${BENCH_NAME} ${ConfigureAnnBench_PATH} bench/ann/src/common/conf.cpp + bench/ann/src/common/util.cpp + ) + target_link_libraries( + ${BENCH_NAME} + PRIVATE raft::raft + nlohmann_json::nlohmann_json + $<$:NCCL::NCCL> + ${ConfigureAnnBench_LINKS} + Threads::Threads + $ + $ + ) + + set_target_properties( + ${BENCH_NAME} + PROPERTIES # set target compile options + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + + set(${ConfigureAnnBench_CXXFLAGS} ${RAFT_CXX_FLAGS} ${ConfigureAnnBench_CXXFLAGS}) + + target_compile_options( + ${BENCH_NAME} PRIVATE "$<$:${ConfigureAnnBench_CXXFLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) + + if(RAFT_ANN_BENCH_USE_${ConfigureAnnBench_NAME}) + target_compile_definitions( + ${BENCH_NAME} + PUBLIC + RAFT_ANN_BENCH_USE_${ConfigureAnnBench_NAME}=RAFT_ANN_BENCH_USE_${ConfigureAnnBench_NAME} + ) + endif() + + target_include_directories( + ${BENCH_NAME} + PUBLIC "$" + PRIVATE ${ConfigureAnnBench_INCLUDES} + ) + + install( + TARGETS ${BENCH_NAME} + COMPONENT ann_bench + DESTINATION bin/ann + EXCLUDE_FROM_ALL + ) +endfunction() + +if(RAFT_ANN_BENCH_USE_HNSWLIB) + ConfigureAnnBench( + NAME HNSWLIB PATH bench/ann/src/hnswlib/hnswlib_benchmark.cpp INCLUDES + ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib CXXFLAGS "${HNSW_CXX_FLAGS}" + ) +endif() + +if(RAFT_ANN_BENCH_USE_RAFT) + ConfigureAnnBench( + NAME + RAFT_IVF_PQ + PATH + bench/ann/src/raft/raft_benchmark.cu + $<$:bench/ann/src/raft/raft_ivf_pq.cu> + $<$:bench/ann/src/raft/raft_ivf_flat.cu> + LINKS + raft::compiled + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS) + ConfigureAnnBench( + NAME FAISS_IVF_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + ) +endif() + +if(RAFT_ANN_BENCH_USE_GGNN) + 
include(cmake/thirdparty/get_glog.cmake) + ConfigureAnnBench( + NAME GGNN PATH bench/ann/src/ggnn/ggnn_benchmark.cu INCLUDES + ${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src/include LINKS glog::glog + ) +endif() diff --git a/cpp/bench/ann/README.md b/cpp/bench/ann/README.md new file mode 100644 index 0000000000..1a8af2e448 --- /dev/null +++ b/cpp/bench/ann/README.md @@ -0,0 +1,3 @@ +# RAFT CUDA ANN Benchmarks + +Please see the [ANN Benchmarks](https://docs.rapids.ai/api/raft/stable/cuda_ann_benchmarks.html) section of the RAFT documentation for instructions on building and using the ANN benchmarks. \ No newline at end of file diff --git a/cpp/bench/ann/conf/bigann-100M.json b/cpp/bench/ann/conf/bigann-100M.json new file mode 100644 index 0000000000..5f16f3378d --- /dev/null +++ b/cpp/bench/ann/conf/bigann-100M.json @@ -0,0 +1,174 @@ +{ + "dataset" : { + "name" : "bigann-100M", + "base_file" : "data/bigann-1B/base.1B.u8bin", + "subset_size" : 100000000, + "query_file" : "data/bigann-1B/query.public.10K.u8bin", + "distance" : "euclidean" + }, + + "search_basic_param" : { + "batch_size" : 10000, + "k" : 10, + "run_count" : 2 + }, + + "index" : [ + { + "name": "raft_ivf_pq.dimpq64-cluster5K-float-float", + "algo": "raft_ivf_pq", + "build_param": { + "niter": 25, + "nlist": 5000, + "pq_dim": 64, + "ratio": 10 + }, + "file": "index/bigann-100M/raft_ivf_pq/dimpq64-cluster5K", + "search_params": [ + { + "numProbes": 20, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 30, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 40, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 500, + 
"internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 1000, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/bigann-100M/raft_ivf_pq/dimpq64-cluster5K-float-float" + }, + { + "name" : "hnswlib.M12", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":32}, + "file" : "index/bigann-100M/hnswlib/M12", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/bigann-100M/hnswlib/M12" + }, + { + "name" : "hnswlib.M16", + "algo" : "hnswlib", + "build_param": {"M":16, "efConstruction":500, "numThreads":32}, + "file" : "index/bigann-100M/hnswlib/M16", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/bigann-100M/hnswlib/M16" + }, + { + "name" : "hnswlib.M24", + "algo" : "hnswlib", + "build_param": {"M":24, "efConstruction":500, "numThreads":32}, + "file" : "index/bigann-100M/hnswlib/M24", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/bigann-100M/hnswlib/M24" + }, + { + "name" : "hnswlib.M36", + "algo" : "hnswlib", + "build_param": {"M":36, "efConstruction":500, 
"numThreads":32}, + "file" : "index/bigann-100M/hnswlib/M36", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/bigann-100M/hnswlib/M36" + }, + + + { + "name" : "ivf_flat.nlist100K", + "algo" : "ivf_flat", + "build_param": { + "nlist" : 100000, + "niter" : 25, + "ratio" : 5 + }, + "file" : "index/bigann-100M/ivf_flat/nlist100K", + "search_params" : [ + {"max_batch":10000, "max_k":10, "nprobe":20}, + {"max_batch":10000, "max_k":10, "nprobe":30}, + {"max_batch":10000, "max_k":10, "nprobe":40}, + {"max_batch":10000, "max_k":10, "nprobe":50}, + {"max_batch":10000, "max_k":10, "nprobe":100}, + {"max_batch":10000, "max_k":10, "nprobe":200}, + {"max_batch":10000, "max_k":10, "nprobe":500}, + {"max_batch":10000, "max_k":10, "nprobe":1000} + ], + "search_result_file" : "result/bigann-100M/ivf_flat/nlist100K" + }, + + + + ] +} diff --git a/cpp/bench/ann/conf/deep-100M.json b/cpp/bench/ann/conf/deep-100M.json new file mode 100644 index 0000000000..b3a945d50e --- /dev/null +++ b/cpp/bench/ann/conf/deep-100M.json @@ -0,0 +1,223 @@ +{ + "dataset" : { + "name" : "deep-100M", + "base_file" : "data/deep-1B/base.1B.fbin", + "subset_size" : 100000000, + "query_file" : "data/deep-1B/query.public.10K.fbin", + "distance" : "euclidean" + }, + + "search_basic_param" : { + "batch_size" : 10000, + "k" : 10, + "run_count" : 2 + }, + + "index" : [ + { + "name" : "hnswlib.M12", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M12", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, 
"numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M12" + }, + { + "name" : "hnswlib.M16", + "algo" : "hnswlib", + "build_param": {"M":16, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M16", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M16" + }, + { + "name" : "hnswlib.M24", + "algo" : "hnswlib", + "build_param": {"M":24, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M24", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M24" + }, + { + "name" : "hnswlib.M36", + "algo" : "hnswlib", + "build_param": {"M":36, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M36", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M36" + }, + { + "name" : "faiss_ivf_flat.nlist50K", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":50000}, + "file" : "index/deep-100M/faiss_ivf_flat/nlist50K", + 
"search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_flat/nlist50K" + }, + { + "name" : "faiss_ivf_flat.nlist100K", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":100000}, + "file" : "index/deep-100M/faiss_ivf_flat/nlist100K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_flat/nlist100K" + }, + { + "name" : "faiss_ivf_flat.nlist200K", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":200000}, + "file" : "index/deep-100M/faiss_ivf_flat/nlist200K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_flat/nlist200K" + }, + + + { + "name" : "faiss_ivf_pq.M48-nlist16K", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":16384, "M":48}, + "file" : "index/deep-100M/faiss_ivf_pq/M48-nlist16K", + "search_params" : [ + {"nprobe":10}, + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_pq/M48-nlist16K" + }, + { + "name" : "faiss_ivf_pq.M48-nlist50K", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":50000, "M":48}, + "file" : "index/deep-100M/faiss_ivf_pq/M48-nlist50K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_pq/M48-nlist50K" + }, + { + "name" : "faiss_ivf_pq.M48-nlist100K", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":100000, 
"M":48}, + "file" : "index/deep-100M/faiss_ivf_pq/M48-nlist100K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_pq/M48-nlist100K" + }, + + + { + "name" : "ivf_flat.nlist100K", + "algo" : "ivf_flat", + "build_param": { + "nlist" : 100000, + "niter" : 25, + "ratio" : 5 + }, + "file" : "index/deep-100M/ivf_flat/nlist100K", + "search_params" : [ + {"max_batch":10000, "max_k":10, "nprobe":20}, + {"max_batch":10000, "max_k":10, "nprobe":30}, + {"max_batch":10000, "max_k":10, "nprobe":40}, + {"max_batch":10000, "max_k":10, "nprobe":50}, + {"max_batch":10000, "max_k":10, "nprobe":100}, + {"max_batch":10000, "max_k":10, "nprobe":200}, + {"max_batch":10000, "max_k":10, "nprobe":500}, + {"max_batch":10000, "max_k":10, "nprobe":1000} + ], + "search_result_file" : "result/deep-100M/ivf_flat/nlist100K" + }, + + + ] +} diff --git a/cpp/bench/ann/conf/deep-1B.json b/cpp/bench/ann/conf/deep-1B.json new file mode 100644 index 0000000000..50d1b87602 --- /dev/null +++ b/cpp/bench/ann/conf/deep-1B.json @@ -0,0 +1,38 @@ +{ + "dataset" : { + "name" : "deep-1B", + "base_file" : "data/deep-1B/base.1B.fbin", + "query_file" : "data/deep-1B/query.public.10K.fbin", + // although distance should be "euclidean", faiss becomes much slower for that + "distance" : "inner_product" + }, + + "search_basic_param" : { + "batch_size" : 10000, + "k" : 10, + "run_count" : 2 + }, + + "index" : [ + { + "name" : "faiss_ivf_pq.M48-nlist50K", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":50000, "M":48}, + "file" : "index/deep-1B/faiss_ivf_pq/M48-nlist50K", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/deep-1B/faiss_ivf_pq/M48-nlist50K" + }, + + + ] +} diff --git 
a/cpp/bench/ann/conf/glove-100-inner.json b/cpp/bench/ann/conf/glove-100-inner.json new file mode 100644 index 0000000000..d210aca654 --- /dev/null +++ b/cpp/bench/ann/conf/glove-100-inner.json @@ -0,0 +1,797 @@ +{ + "dataset" : { + "name" : "glove-100-inner", + "base_file" : "data/glove-100-inner/base.fbin", + "query_file" : "data/glove-100-inner/query.fbin", + "distance" : "inner_product" + }, + + "search_basic_param" : { + "batch_size" : 1, + "k" : 10, + "run_count" : 3 + }, + + "index" : [ + { + "name" : "hnswlib.M4", + "algo" : "hnswlib", + "build_param": {"M":4, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M4", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M4" + }, + + { + "name" : "hnswlib.M8", + "algo" : "hnswlib", + "build_param": {"M":8, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M8", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M8" + }, + + { + "name" : "hnswlib.M12", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M12", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], 
+ "search_result_file" : "result/glove-100-inner/hnswlib/M12" + }, + + { + "name" : "hnswlib.M16", + "algo" : "hnswlib", + "build_param": {"M":16, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M16", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M16" + }, + + { + "name" : "hnswlib.M24", + "algo" : "hnswlib", + "build_param": {"M":24, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M24", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M24" + }, + + { + "name" : "hnswlib.M36", + "algo" : "hnswlib", + "build_param": {"M":36, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M36", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M36" + }, + + { + "name" : "hnswlib.M48", + "algo" : "hnswlib", + "build_param": {"M":48, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M48", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, 
"numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M48" + }, + + { + "name" : "hnswlib.M64", + "algo" : "hnswlib", + "build_param": {"M":64, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M64", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M64" + }, + + { + "name" : "hnswlib.M96", + "algo" : "hnswlib", + "build_param": {"M":96, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M96", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M96" + }, + + { + "name" : "faiss_ivf_flat.nlist1024", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":1024}, + "file" : "index/glove-100-inner/faiss_ivf_flat/nlist1024", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist1024" + }, + + { + "name" : "faiss_ivf_flat.nlist2048", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":2048}, + "file" : "index/glove-100-inner/faiss_ivf_flat/nlist2048", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + 
"search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist2048" + }, + + { + "name" : "faiss_ivf_flat.nlist4096", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":4096}, + "file" : "index/glove-100-inner/faiss_ivf_flat/nlist4096", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist4096" + }, + + { + "name" : "faiss_ivf_flat.nlist8192", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":8192}, + "file" : "index/glove-100-inner/faiss_ivf_flat/nlist8192", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist8192" + }, + + { + "name" : "faiss_ivf_flat.nlist16384", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":16384}, + "file" : "index/glove-100-inner/faiss_ivf_flat/nlist16384", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist16384" + }, + + + + { + "name" : "faiss_ivf_pq.M2-nlist1024", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":1024, "M":2}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M2-nlist1024", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M2-nlist1024" + }, + + { + "name" : "faiss_ivf_pq.M2-nlist2048", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":2048, "M":2}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M2-nlist2048", + "search_params" : [ + {"nprobe":1}, + 
{"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M2-nlist2048" + }, + + { + "name" : "faiss_ivf_pq.M2-nlist4096", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":4096, "M":2}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M2-nlist4096", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M2-nlist4096" + }, + + { + "name" : "faiss_ivf_pq.M2-nlist8192", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":8192, "M":2}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M2-nlist8192", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M2-nlist8192" + }, + + { + "name" : "faiss_ivf_pq.M2-nlist16384", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":16384, "M":2}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M2-nlist16384", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M2-nlist16384" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist1024", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":1024, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist1024", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist1024" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist2048", + "algo" : 
"faiss_gpu_ivf_pq", + "build_param": {"nlist":2048, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist2048", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist2048" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist4096", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":4096, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist4096", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist4096" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist8192", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":8192, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist8192", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist8192" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist16384", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":16384, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist16384", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist16384" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist1024", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":1024, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist1024", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + 
{"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist1024" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist2048", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":2048, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist2048", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist2048" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist4096", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":4096, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist4096", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist4096" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist8192", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":8192, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist8192", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist8192" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist16384", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":16384, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist16384", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist16384" + }, + + + { + "name" : "faiss_ivf_sq.nlist1024-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":1024, "quantizer_type":"fp16"}, + "file" : 
"index/glove-100-inner/faiss_ivf_sq/nlist1024-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist1024-fp16" + }, + + { + "name" : "faiss_ivf_sq.nlist2048-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":2048, "quantizer_type":"fp16"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist2048-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist2048-fp16" + }, + + { + "name" : "faiss_ivf_sq.nlist4096-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":4096, "quantizer_type":"fp16"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist4096-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist4096-fp16" + }, + + { + "name" : "faiss_ivf_sq.nlist8192-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":8192, "quantizer_type":"fp16"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist8192-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist8192-fp16" + }, + + { + "name" : "faiss_ivf_sq.nlist16384-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":16384, "quantizer_type":"fp16"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist16384-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + 
{"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist16384-fp16" + }, + + + { + "name" : "faiss_ivf_sq.nlist1024-int8", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":1024, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist1024-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist1024-int8" + }, + + { + "name" : "faiss_ivf_sq.nlist2048-int8", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":2048, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist2048-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist2048-int8" + }, + + { + "name" : "faiss_ivf_sq.nlist4096-int8", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":4096, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist4096-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist4096-int8" + }, + + { + "name" : "faiss_ivf_sq.nlist8192-int8", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":8192, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist8192-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist8192-int8" + }, + + { + "name" : "faiss_ivf_sq.nlist16384-int8", + "algo" : 
"faiss_gpu_ivf_sq", + "build_param": {"nlist":16384, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist16384-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist16384-int8" + }, + + { + "name" : "faiss_flat", + "algo" : "faiss_gpu_flat", + "build_param": {}, + "file" : "index/glove-100-inner/faiss_flat/flat", + "search_params" : [{}], + "search_result_file" : "result/glove-100-inner/faiss_flat/flat" + }, + + { + "name" : "ggnn.kbuild96-segment64-refine2-k10", + "algo" : "ggnn", + "build_param": { + "k_build": 96, + "segment_size": 64, + "refine_iterations": 2, + "dataset_size": 1183514, + "k": 10 + }, + "file" : "index/glove-100-inner/ggnn/kbuild96-segment64-refine2-k10", + "search_params" : [ + {"tau":0.001, "block_dim":64, "sorted_size":32}, + {"tau":0.005, "block_dim":64, "sorted_size":32}, + {"tau":0.01, "block_dim":64, "sorted_size":32}, + {"tau":0.02, "block_dim":64, "sorted_size":32}, + {"tau":0.03, "block_dim":64, "sorted_size":32}, + {"tau":0.04, "block_dim":64, "sorted_size":32}, + {"tau":0.05, "block_dim":64, "sorted_size":32}, + {"tau":0.06, "block_dim":64, "sorted_size":32}, + {"tau":0.09, "block_dim":64, "sorted_size":32}, + {"tau":0.12, "block_dim":64, "sorted_size":32}, + {"tau":0.18, "block_dim":64, "sorted_size":32}, + {"tau":0.21, "block_dim":64, "sorted_size":32}, + {"tau":0.24, "block_dim":64, "sorted_size":32}, + {"tau":0.27, "block_dim":64, "sorted_size":32}, + {"tau":0.3, "block_dim":64, "sorted_size":32}, + {"tau":0.4, "block_dim":64, "sorted_size":32}, + {"tau":0.01, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.02, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.03, "block_dim":128, "max_iterations":2000, "cache_size":1024, 
"sorted_size":32}, + {"tau":0.04, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.05, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.06, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.09, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.12, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.18, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.21, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.24, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.27, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.3, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.4, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.5, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32} + + ], + "search_result_file" : "result/glove-100-inner/ggnn/kbuild96-segment64-refine2-k10" + } + + + ] + +} diff --git a/cpp/bench/ann/conf/sift-128-euclidean.json b/cpp/bench/ann/conf/sift-128-euclidean.json new file mode 100644 index 0000000000..476c363ecd --- /dev/null +++ b/cpp/bench/ann/conf/sift-128-euclidean.json @@ -0,0 +1,1321 @@ +{ + "dataset": { + "name": "sift-128-euclidean", + "base_file": "data/sift-128-euclidean/base.fbin", + "query_file": "data/sift-128-euclidean/query.fbin", + "distance": "euclidean" + }, + "search_basic_param": { + "batch_size": 5000, + "k": 10, + "run_count": 3 + }, + "index": [ + { + "name" : "hnswlib.M12", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":32}, + "file" : "index/sift-128-euclidean/hnswlib/M12", + "search_params" 
: [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/sift-128-euclidean/hnswlib/M12" + }, + { + "name" : "hnswlib.M16", + "algo" : "hnswlib", + "build_param": {"M":16, "efConstruction":500, "numThreads":32}, + "file" : "index/sift-128-euclidean/hnswlib/M16", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/sift-128-euclidean/hnswlib/M16" + }, + { + "name" : "hnswlib.M24", + "algo" : "hnswlib", + "build_param": {"M":24, "efConstruction":500, "numThreads":32}, + "file" : "index/sift-128-euclidean/hnswlib/M24", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/sift-128-euclidean/hnswlib/M24" + }, + { + "name" : "hnswlib.M36", + "algo" : "hnswlib", + "build_param": {"M":36, "efConstruction":500, "numThreads":32}, + "file" : "index/sift-128-euclidean/hnswlib/M36", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + 
"search_result_file" : "result/sift-128-euclidean/hnswlib/M36" + }, + + + + + { + "name": "raft_bfknn", + "algo": "raft_bfknn", + "build_param": {}, + "file": "index/sift-128-euclidean/raft_bfknn/bfknn", + "search_params": [ + { + "probe": 1 + } + ], + "search_result_file": "result/sift-128-euclidean/raft_bfknn/bfknn" + }, + { + "name": "faiss_ivf_flat.nlist1024", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 1024 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist1024", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist1024" + }, + { + "name": "faiss_ivf_flat.nlist2048", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 2048 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist2048", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist2048" + }, + { + "name": "faiss_ivf_flat.nlist4096", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 4096 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist4096", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist4096" + }, + { + "name": "faiss_ivf_flat.nlist8192", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 8192 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist8192", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 
5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist8192" + }, + { + "name": "faiss_ivf_flat.nlist16384", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 16384 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist16384", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + }, + { + "nprobe": 2000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist16384" + }, + { + "name": "faiss_ivf_pq.M64-nlist1024", + "algo": "faiss_gpu_ivf_pq", + "build_param": { + "nlist": 1024, + "M": 64, + "useFloat16": true, + "usePrecomputed": true + }, + "file": "index/sift-128-euclidean/faiss_ivf_pq/M64-nlist1024", + "search_params": [ + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_pq/M64-nlist1024" + }, + { + "name": "faiss_ivf_pq.M64-nlist1024.noprecomp", + "algo": "faiss_gpu_ivf_pq", + "build_param": { + "nlist": 1024, + "M": 64, + "useFloat16": true, + "usePrecomputed": false + }, + "file": "index/sift-128-euclidean/faiss_ivf_pq/M64-nlist1024.noprecomp", + "search_params": [ + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_pq/M64-nlist1024.noprecomp" + }, + { + "name": "faiss_ivf_sq.nlist1024-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 1024, + "quantizer_type": "fp16" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist1024-fp16", + "search_params": [ + { + 
"nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist1024-fp16" + }, + { + "name": "faiss_ivf_sq.nlist2048-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 2048, + "quantizer_type": "fp16" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist2048-fp16", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist2048-fp16" + }, + { + "name": "faiss_ivf_sq.nlist4096-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 4096, + "quantizer_type": "fp16" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist4096-fp16", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist4096-fp16" + }, + { + "name": "faiss_ivf_sq.nlist8192-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 8192, + "quantizer_type": "fp16" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist8192-fp16", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist8192-fp16" + }, + { + "name": "faiss_ivf_sq.nlist16384-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 16384, + "quantizer_type": "fp16" + }, + "file": 
"index/sift-128-euclidean/faiss_ivf_sq/nlist16384-fp16", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + }, + { + "nprobe": 2000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist16384-fp16" + }, + { + "name": "faiss_ivf_sq.nlist1024-int8", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 1024, + "quantizer_type": "int8" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist1024-int8", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist1024-int8" + }, + { + "name": "faiss_ivf_sq.nlist2048-int8", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 2048, + "quantizer_type": "int8" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist2048-int8", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist2048-int8" + }, + { + "name": "faiss_ivf_sq.nlist4096-int8", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 4096, + "quantizer_type": "int8" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist4096-int8", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist4096-int8" + }, + { + "name": "faiss_ivf_sq.nlist8192-int8", + "algo": 
"faiss_gpu_ivf_sq", + "build_param": { + "nlist": 8192, + "quantizer_type": "int8" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist8192-int8", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist8192-int8" + }, + { + "name": "faiss_ivf_sq.nlist16384-int8", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 16384, + "quantizer_type": "int8" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist16384-int8", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + }, + { + "nprobe": 2000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist16384-int8" + }, + { + "name": "faiss_flat", + "algo": "faiss_gpu_flat", + "build_param": {}, + "file": "index/sift-128-euclidean/faiss_flat/flat", + "search_params": [ + {} + ], + "search_result_file": "result/sift-128-euclidean/faiss_flat/flat" + }, + + { + "name": "raft_ivf_pq.dimpq128-cluster1024", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + 
"k": 10, + "numProbes": 1024, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024" + }, + { + "name": "raft_ivf_pq.dimpq128-cluster1024-float-float", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-float", + "search_params": [ + { + "k": 10, + "numProbes": 1, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 1, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 5, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-float" + }, + { + "name": "raft_ivf_pq.dimpq128-cluster1024-float-half", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-half", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 100, + 
"internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-half" + }, + { + "name": "raft_ivf_pq.dimpq128-cluster1024-float-fp8", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-fp8", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-fp8" + }, + { + "name": "raft_ivf_pq.dimpq64-cluster1024-float-fp8", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 64, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-fp8", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + 
"smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-fp8" + }, + { + "name": "raft_ivf_pq.dimpq64-cluster1024-float-half", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 64, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-half", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-half" + }, + { + "name": "raft_ivf_pq.dimpq32-cluster1024-float-fp8", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 32, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq32-cluster1024-float-fp8", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + 
"numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq32-cluster1024-float-fp8" + }, + { + "name": "raft_ivf_pq.dimpq16-cluster1024-float-fp8", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 16, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq16-cluster1024-float-fp8", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq16-cluster1024-float-fp8" + }, + { + "name": "raft_ivf_pq.dimpq128-cluster1024-half-float", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-half-float", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": 
"half", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-half-float" + }, + { + "name": "raft_ivf_pq.dimpq512-cluster1024-float-float", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 512, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq512-cluster1024-float-float", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq512-cluster1024-float-float" + }, + { + "name": "raft_ivf_flat.nlist1024", + "algo": "raft_ivf_flat", + "build_param": { + "nlist": 1024, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_flat/nlist1024", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_flat/nlist1024" + }, + { + "name": "raft_ivf_flat.nlist16384", + "algo": "raft_ivf_flat", + "build_param": { + "nlist": 16384, + "ratio": 2, + "niter": 20 + }, + "file": 
"index/sift-128-euclidean/raft_ivf_flat/nlist16384", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + }, + { + "nprobe": 2000 + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_flat/nlist16384" + } + ] +} diff --git a/cpp/bench/ann/scripts/eval.pl b/cpp/bench/ann/scripts/eval.pl new file mode 100755 index 0000000000..81c5563d79 --- /dev/null +++ b/cpp/bench/ann/scripts/eval.pl @@ -0,0 +1,430 @@ +#!/usr/bin/perl + +# ============================================================================= +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +use warnings; +use strict; +use autodie qw(open close); +use File::Find; +use Getopt::Std; + +my $QPS = 'QPS'; +my $AVG_LATENCY = 'avg_latency(ms)'; +my $P99_LATENCY = 'p99_latency(ms)'; +my $P999_LATENCY = 'p999_latency(ms)'; +my @CONDITIONS = ([$QPS, 2000], ['recall', 0.9], ['recall', 0.95]); + + +my $USAGE = << 'END'; +usage: [-f] [-l avg|p99|p999] [-o output.csv] groundtruth.neighbors.ibin result_paths... + result_paths... are paths to the search result files. + Can specify multiple paths. + For each of them, if it's a directory, all the .txt files found under + it recursively will be regarded as inputs. 
+ + -f: force to recompute recall and update it in result file if needed + -l: output search latency rather than QPS. Available options: + "avg" for average latency; + "p99" for 99th percentile latency; + "p999" for 99.9th percentile latency. + -o: also write result to a csv file +END + + +my %opt; +getopts('fl:o:', \%opt) + or die $USAGE; +my $force_calc_recall = exists $opt{f} ? 1 : 0; +my $csv_file; +$csv_file = $opt{o} if exists $opt{o}; +my $metric = $QPS; +if (exists $opt{l}) { + my $option = $opt{l}; + if ($option eq 'avg') { + $metric = $AVG_LATENCY; + } + elsif ($option eq 'p99') { + $metric = $P99_LATENCY; + } + elsif ($option eq 'p999') { + $metric = $P999_LATENCY; + } + else { + die + "[error] illegal value for '-l': '$option'. Must be 'avg', 'p99' or 'p999'\n"; + } +} + +@ARGV >= 2 + or die $USAGE; + + +my $truth_file = shift @ARGV; +my ($k, $dataset, $distance, $results) = get_all_results($metric, @ARGV); +if (!defined $k) { + print STDERR "no result file found\n"; + exit -1; +} +print STDERR "dataset = $dataset, distance = $distance, k = $k\n\n"; +calc_missing_recall($results, $truth_file, $force_calc_recall); + +my @results = sort { + $a->{name} cmp $b->{name} + or $a->{recall} <=> $b->{recall} + or $b->{qps} <=> $a->{qps} +} @$results; +printf("%-60s %6s %16s %s\n", '', 'Recall', $metric, 'search_param'); +for my $result (@results) { + my $fmt = ($metric eq $QPS) ? 
'%16.1f' : '%16.3f'; + my $qps = $result->{qps}; + $qps *= 1000 if $metric ne $QPS; # the unit of latency is ms + printf("%-60s %6.4f ${fmt} %s\n", + $result->{name}, $result->{recall}, $qps, $result->{search_param}); +} +if (defined $csv_file) { + open my $fh, '>', $csv_file; + print {$fh} ",Recall,${metric},search_param\n"; + for my $result (@results) { + my $qps = $result->{qps}; + $qps *= 1000 if $metric ne $QPS; + printf {$fh} ( + "%s,%.4f,%.3f,%s\n", $result->{name}, $result->{recall}, + $qps, $result->{search_param} + ); + } +} +print "\n"; +calc_and_print_estimation($results, $metric, \@CONDITIONS); + + + + +sub read_result { + my ($fname) = @_; + open my $fh, '<', $fname; + my %attr; + while (<$fh>) { + chomp; + next if /^\s*$/; + my $pos = index($_, ':'); + $pos != -1 + or die "[error] no ':' is found: '$_'\n"; + my $key = substr($_, 0, $pos); + my $val = substr($_, $pos + 1); + $key =~ s/^\s+|\s+$//g; + $val =~ s/^\s+|\s+$//g; + + # old version benchmark compatible + if ($key eq 'search_time') { + $key = 'average_search_time'; + $val *= $attr{batch_size}; + } + $attr{$key} = $val; + } + return \%attr; +} + +sub overwrite_recall_to_result { + my ($fname, $recall) = @_; + open my $fh_in, '<', $fname; + $recall = sprintf("%f", $recall); + my $out; + while (<$fh_in>) { + s/^recall: .*/recall: $recall/; + $out .= $_; + } + close $fh_in; + + open my $fh_out, '>', $fname; + print {$fh_out} $out; +} + +sub append_recall_to_result { + my ($fname, $recall) = @_; + open my $fh, '>>', $fname; + printf {$fh} ("recall: %f\n", $recall); +} + +sub get_all_results { + my ($metric) = shift @_; + + my %fname; + my $wanted = sub { + if (-f && /\.txt$/) { + $fname{$File::Find::name} = 1; + } + }; + find($wanted, @_); + + my $k; + my $dataset; + my $distance; + my @results; + for my $f (sort keys %fname) { + print STDERR "reading $f ...\n"; + my $attr = read_result($f); + if (!defined $k) { + $k = $attr->{k}; + $dataset = $attr->{dataset}; + $distance = $attr->{distance}; + } 
+ else { + $attr->{k} eq $k + or die "[error] k should be $k, but is $attr->{k} in $f\n"; + $attr->{dataset} eq $dataset + or die + "[error] dataset should be $dataset, but is $attr->{dataset} in $f\n"; + $attr->{distance} eq $distance + or die + "[error] distance should be $distance, but is $attr->{distance} in $f\n"; + } + + my $batch_size = $attr->{batch_size}; + $batch_size =~ s/000000$/M/; + $batch_size =~ s/000$/K/; + my $search_param = $attr->{search_param}; + $search_param =~ s/^{//; + $search_param =~ s/}$//; + $search_param =~ s/,/ /g; + $search_param =~ s/"//g; + + my $qps; + if ($metric eq $QPS) { + $qps = $attr->{batch_size} / $attr->{average_search_time}; + } + elsif ($metric eq $AVG_LATENCY) { + $qps = $attr->{average_search_time}; + } + elsif ($metric eq $P99_LATENCY) { + exists $attr->{p99_search_time} + or die "[error] p99_search_time is not found\n"; + $qps = $attr->{p99_search_time}; + } + elsif ($metric eq $P999_LATENCY) { + exists $attr->{p999_search_time} + or die "[error] p999_search_time is not found\n"; + $qps = $attr->{p999_search_time}; + } + else { + die "[error] unknown latency type: '$metric'\n"; + } + my $result = { + file => $f, + name => "$attr->{name}-batch${batch_size}", + search_param => $search_param, + qps => $qps, + }; + + if (exists $attr->{recall}) { + $result->{recall} = $attr->{recall}; + } + push @results, $result; + } + return $k, $dataset, $distance, \@results; +} + +sub read_ibin { + my ($fname) = @_; + + open my $fh, '<:raw', $fname; + my $raw; + + read($fh, $raw, 8); + my ($nrows, $dim) = unpack('LL', $raw); + + my $expected_size = 8 + $nrows * $dim * 4; + my $size = (stat($fh))[7]; + $size == $expected_size + or die( + "[error] expected size is $expected_size, but actual size is $size\n"); + + read($fh, $raw, $nrows * $dim * 4) == $nrows * $dim * 4 + or die "[error] read $fname failed\n"; + my @data = unpack('l' x ($nrows * $dim), $raw); + return \@data, $nrows, $dim; +} + +sub pick_k_neighbors { + my ($neighbors, 
$nrows, $ncols, $k) = @_; + + my @res; + for my $i (0 .. $nrows - 1) { + my %neighbor_set; + for my $j (0 .. $k - 1) { + $neighbor_set{$neighbors->[$i * $ncols + $j]} = 1; + } + push @res, \%neighbor_set; + } + return \@res; +} + + +sub calc_recall { + my ($truth_k_neighbors, $result_neighbors, $nrows, $k) = @_; + + my $recall = 0; + for my $i (0 .. $nrows - 1) { + my $tp = 0; + for my $j (0 .. $k - 1) { + my $neighbor = $result_neighbors->[$i * $k + $j]; + ++$tp if exists $truth_k_neighbors->[$i]{$neighbor}; + } + $recall += $tp; + } + return $recall / $k / $nrows; +} + +sub calc_missing_recall { + my ($results, $truth_file, $force_calc_recall) = @_; + + my $need_calc_recall = grep { !exists $_->{recall} } @$results; + return unless $need_calc_recall || $force_calc_recall; + + my ($truth_neighbors, $nrows, $truth_k) = read_ibin($truth_file); + $truth_k >= $k + or die "[error] ground truth k ($truth_k) < k($k)\n"; + my $truth_k_neighbors = + pick_k_neighbors($truth_neighbors, $nrows, $truth_k, $k); + + for my $result (@$results) { + next if exists $result->{recall} && !$force_calc_recall; + + my $result_bin_file = $result->{file}; + $result_bin_file =~ s/txt$/ibin/; + print STDERR "calculating recall for $result_bin_file ...\n"; + my ($result_neighbors, $result_nrows, $result_k) = + read_ibin($result_bin_file); + $result_k == $k + or die + "[error] k should be $k, but is $result_k in $result_bin_file\n"; + $result_nrows == $nrows + or die + "[error] #row should be $nrows, but is $result_nrows in $result_bin_file\n"; + + my $recall = + calc_recall($truth_k_neighbors, $result_neighbors, $nrows, $k); + if (exists $result->{recall}) { + my $new_value = sprintf("%f", $recall); + if ($result->{recall} ne $new_value) { + print "update recall: $result->{recall} -> $new_value\n"; + overwrite_recall_to_result($result->{file}, $recall); + } + } + else { + append_recall_to_result($result->{file}, $recall); + } + $result->{recall} = $recall; + } +} + + +sub estimate { + my 
($results, $condition, $value) = @_; + my %point_of; + for my $result (@$results) { + my $point; + if ($condition eq 'recall') { + $point = [$result->{recall}, $result->{qps}]; + } + else { + $point = [$result->{qps}, $result->{recall}]; + } + push @{$point_of{$result->{name}}}, $point; + } + + my @names = sort keys %point_of; + my @result; + for my $name (@names) { + my @points = sort { $a->[0] <=> $b->[0] } @{$point_of{$name}}; + if ($value < $points[0][0] || $value > $points[$#points][0]) { + push @result, -1; + next; + } + elsif ($value == $points[0][0]) { + push @result, $points[0][1]; + next; + } + + for my $i (1 .. $#points) { + if ($points[$i][0] >= $value) { + push @result, + linear_interpolation($value, @{$points[$i - 1]}, + @{$points[$i]}); + last; + } + } + } + return \@names, \@result; +} + +sub linear_interpolation { + my ($x, $x1, $y1, $x2, $y2) = @_; + return $y1 + ($x - $x1) * ($y2 - $y1) / ($x2 - $x1); +} + +sub merge { + my ($all, $new, $scale) = @_; + @$all == @$new + or die "[error] length is not equal\n"; + for my $i (0 .. @$all - 1) { + push @{$all->[$i]}, $new->[$i] * $scale; + } +} + +sub calc_and_print_estimation { + my ($results, $metric, $conditions) = @_; + + my @conditions = grep { + my $target = $_->[0]; + if ($target eq 'recall' || $target eq $metric) { + 1; + } + else { + $target eq $QPS + || $target eq $AVG_LATENCY + || $target eq $P99_LATENCY + || $target eq $P999_LATENCY + or die "[error] unknown condition: '$target'\n"; + 0; + } + } @$conditions; + + my @headers = map { + my $header; + if ($_->[0] eq 'recall') { + $header = $metric . '@recall' . $_->[1]; + } + elsif ($_->[0] eq $metric) { + $header = 'recall@' . $metric . $_->[1]; + } + $header; + } @conditions; + + my $scale = ($metric eq $QPS) ? 
1 : 1000; + my $estimations; + for my $condition (@conditions) { + my ($names, $estimate) = estimate($results, @$condition); + if (!defined $estimations) { + @$estimations = map { [$_] } @$names; + } + merge($estimations, $estimate, $scale); + } + + my $fmt = "%-60s" . (" %16s" x @headers) . "\n"; + printf($fmt, '', @headers); + $fmt =~ s/16s/16.4f/g; + for (@$estimations) { + printf($fmt, @$_); + } +} diff --git a/cpp/bench/ann/scripts/fbin_to_f16bin.py b/cpp/bench/ann/scripts/fbin_to_f16bin.py new file mode 100755 index 0000000000..4ea8988d87 --- /dev/null +++ b/cpp/bench/ann/scripts/fbin_to_f16bin.py @@ -0,0 +1,46 @@ +# ============================================================================= +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import sys +import numpy as np + + +def read_fbin(fname): + shape = np.fromfile(fname, dtype=np.uint32, count=2) + if float(shape[0]) * shape[1] * 4 > 2000000000: + data = np.memmap(fname, dtype=np.float32, offset=8, mode="r").reshape( + shape + ) + else: + data = np.fromfile(fname, dtype=np.float32, offset=8).reshape(shape) + return data + + +def write_bin(fname, data): + with open(fname, "wb") as f: + np.asarray(data.shape, dtype=np.uint32).tofile(f) + data.tofile(f) + + +if len(sys.argv) != 3: + print( + "usage: %s input.fbin output.f16bin" % (sys.argv[0]), + file=sys.stderr, + ) + sys.exit(-1) + +data = read_fbin(sys.argv[1]).astype(np.float16) +write_bin(sys.argv[2], data) diff --git a/cpp/bench/ann/scripts/hdf5_to_fbin.py b/cpp/bench/ann/scripts/hdf5_to_fbin.py new file mode 100755 index 0000000000..cfeb184ea8 --- /dev/null +++ b/cpp/bench/ann/scripts/hdf5_to_fbin.py @@ -0,0 +1,85 @@ +# ============================================================================= +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+ +import sys +import numpy as np +import h5py + + +def normalize(x): + norm = np.linalg.norm(x, axis=1) + return (x.T / norm).T + + +def write_bin(fname, data): + with open(fname, "wb") as f: + np.asarray(data.shape, dtype=np.uint32).tofile(f) + data.tofile(f) + + +if __name__ == "__main__": + if len(sys.argv) != 2 and len(sys.argv) != 3: + print( + "usage: %s [-n] .hdf5\n" % (sys.argv[0]), + " -n: normalize base/query set\n", + "outputs: .base.fbin\n", + " .query.fbin\n", + " .groundtruth.neighbors.ibin\n", + " .groundtruth.distances.fbin", + file=sys.stderr, + ) + sys.exit(-1) + + need_normalize = False + if len(sys.argv) == 3: + assert sys.argv[1] == "-n" + need_normalize = True + fname_prefix = sys.argv[-1] + assert fname_prefix.endswith(".hdf5") + fname_prefix = fname_prefix[:-5] + + hdf5 = h5py.File(sys.argv[-1], "r") + assert ( + hdf5.attrs["distance"] == "angular" + or hdf5.attrs["distance"] == "euclidean" + ) + assert hdf5["train"].dtype == np.float32 + assert hdf5["test"].dtype == np.float32 + assert hdf5["neighbors"].dtype == np.int32 + assert hdf5["distances"].dtype == np.float32 + + base = hdf5["train"][:] + query = hdf5["test"][:] + if need_normalize: + base = normalize(base) + query = normalize(query) + elif hdf5.attrs["distance"] == "angular": + print( + "warning: input has angular distance, specify -n to normalize base/query set!\n" + ) + + output_fname = fname_prefix + ".base.fbin" + print("writing", output_fname, "...") + write_bin(output_fname, base) + + output_fname = fname_prefix + ".query.fbin" + print("writing", output_fname, "...") + write_bin(output_fname, query) + + output_fname = fname_prefix + ".groundtruth.neighbors.ibin" + print("writing", output_fname, "...") + write_bin(output_fname, hdf5["neighbors"][:]) + + output_fname = fname_prefix + ".groundtruth.distances.fbin" + print("writing", output_fname, "...") + write_bin(output_fname, hdf5["distances"][:]) diff --git a/cpp/bench/ann/scripts/split_groundtruth.pl 
b/cpp/bench/ann/scripts/split_groundtruth.pl new file mode 100755 index 0000000000..b0a59f806c --- /dev/null +++ b/cpp/bench/ann/scripts/split_groundtruth.pl @@ -0,0 +1,45 @@ +#!/usr/bin/perl + +# ============================================================================= +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +use warnings; +use strict; +use autodie qw(open close); + + +@ARGV == 2 + or die "usage: $0 input output_prefix\n"; + +open my $fh, '<:raw', $ARGV[0]; + +my $raw; +read($fh, $raw, 8); +my ($nrows, $dim) = unpack('LL', $raw); + +my $expected_size = 8 + $nrows * $dim * (4 + 4); +my $size = (stat($fh))[7]; +$size == $expected_size + or die("error: expected size is $expected_size, but actual size is $size\n"); + + +open my $fh_out1, '>:raw', "$ARGV[1].neighbors.ibin"; +open my $fh_out2, '>:raw', "$ARGV[1].distances.fbin"; + +print {$fh_out1} $raw; +print {$fh_out2} $raw; + +read($fh, $raw, $nrows * $dim * 4); +print {$fh_out1} $raw; +read($fh, $raw, $nrows * $dim * 4); +print {$fh_out2} $raw; diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp new file mode 100644 index 0000000000..8f73896e07 --- /dev/null +++ b/cpp/bench/ann/src/common/ann_types.hpp @@ -0,0 +1,88 @@ + + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include + +#include + +namespace raft::bench::ann { + +enum class Metric { + kInnerProduct, + kEuclidean, +}; + +enum class MemoryType { + Host, + HostMmap, + Device, +}; + +struct AlgoProperty { + MemoryType dataset_memory_type; + // neighbors/distances should have same memory type as queries + MemoryType query_memory_type; + bool need_dataset_when_search; +}; + +template +class ANN { + public: + struct AnnSearchParam { + virtual ~AnnSearchParam() = default; + }; + + ANN(Metric metric, int dim) : metric_(metric), dim_(dim) {} + virtual ~ANN() = default; + + virtual void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) = 0; + + virtual void set_search_param(const AnnSearchParam& param) = 0; + // TODO: this assumes that an algorithm can always return k results. + // This is not always possible. + virtual void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const = 0; + + virtual void save(const std::string& file) const = 0; + virtual void load(const std::string& file) = 0; + + virtual AlgoProperty get_property() const = 0; + + // Some algorithms don't save the building dataset in their indices. + // So they should be given the access to that dataset during searching. + // The advantage of this way is that index has smaller size + // and many indices can share one dataset. + // + // AlgoProperty::need_dataset_when_search of such algorithm should be true, + // and set_search_dataset() should save the passed-in pointer somewhere. 
+ // The client code should call set_search_dataset() before searching, + // and should not release dataset before searching is finished. + virtual void set_search_dataset(const T* /*dataset*/, size_t /*nrow*/){}; + + protected: + Metric metric_; + int dim_; +}; + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp new file mode 100644 index 0000000000..b4d8fbeee3 --- /dev/null +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -0,0 +1,591 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifdef NVTX +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "benchmark_util.hpp" +#include "conf.h" +#include "dataset.h" +#include "util.h" + +using std::cerr; +using std::cout; +using std::endl; +using std::string; +using std::to_string; +using std::unordered_set; +using std::vector; + +namespace raft::bench::ann { + +inline bool check_file_exist(const std::vector& files) +{ + bool ret = true; + std::unordered_set processed; + for (const auto& file : files) { + if (processed.find(file) == processed.end() && !file_exists(file)) { + log_error("file '%s' doesn't exist or is not a regular file", file.c_str()); + ret = false; + } + processed.insert(file); + } + return ret; +} + +inline bool check_file_not_exist(const std::vector& files, bool force_overwrite) +{ + bool ret = true; + for (const auto& file : files) { + if (file_exists(file)) { + if (force_overwrite) { + log_warn("'%s' already exists, will overwrite it", file.c_str()); + } else { + log_error("'%s' already exists, use '-f' to force overwriting", file.c_str()); + ret = false; + } + } + } + return ret; +} + +inline bool check_no_duplicate_file(const std::vector& files) +{ + bool ret = true; + std::unordered_set processed; + for (const auto& file : files) { + if (processed.find(file) != processed.end()) { + log_error("'%s' occurs more than once as output file, would be overwritten", file.c_str()); + ret = false; + } + processed.insert(file); + } + return ret; +} + +inline bool mkdir(const std::vector& dirs) +{ + std::unordered_set processed; + for (const auto& dir : dirs) { + if (processed.find(dir) == processed.end() && !dir_exists(dir)) { + if (create_dir(dir)) { + log_info("mkdir '%s'", dir.c_str()); + } else { + log_error("fail to create output directory '%s'", dir.c_str()); + // won't create any other dir when problem occurs + return false; + } + } + processed.insert(dir); + } + return true; +} + 
+inline bool check(const std::vector& indices, + bool build_mode, + bool force_overwrite) +{ + std::vector files_should_exist; + std::vector dirs_should_exist; + std::vector output_files; + for (const auto& index : indices) { + if (build_mode) { + output_files.push_back(index.file); + output_files.push_back(index.file + ".txt"); + + auto pos = index.file.rfind('/'); + if (pos != std::string::npos) { dirs_should_exist.push_back(index.file.substr(0, pos)); } + } else { + files_should_exist.push_back(index.file); + files_should_exist.push_back(index.file + ".txt"); + + output_files.push_back(index.search_result_file + ".0.ibin"); + output_files.push_back(index.search_result_file + ".0.txt"); + + auto pos = index.search_result_file.rfind('/'); + if (pos != std::string::npos) { + dirs_should_exist.push_back(index.search_result_file.substr(0, pos)); + } + } + } + + bool ret = true; + if (!check_file_exist(files_should_exist)) { ret = false; } + if (!check_file_not_exist(output_files, force_overwrite)) { ret = false; } + if (!check_no_duplicate_file(output_files)) { ret = false; } + if (ret && !mkdir(dirs_should_exist)) { ret = false; } + return ret; +} + +inline void write_build_info(const std::string& file_prefix, + const std::string& dataset, + const std::string& distance, + const std::string& name, + const std::string& algo, + const std::string& build_param, + float build_time) +{ + std::ofstream ofs(file_prefix + ".txt"); + if (!ofs) { throw std::runtime_error("can't open build info file: " + file_prefix + ".txt"); } + ofs << "dataset: " << dataset << "\n" + << "distance: " << distance << "\n" + << "\n" + << "name: " << name << "\n" + << "algo: " << algo << "\n" + << "build_param: " << build_param << "\n" + << "build_time: " << build_time << endl; + ofs.close(); + if (!ofs) { throw std::runtime_error("can't write to build info file: " + file_prefix + ".txt"); } +} + +template +void build(const Dataset* dataset, const std::vector& indices) +{ + cudaStream_t stream; + 
RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + + log_info( + "base set from dataset '%s', #vector = %zu", dataset->name().c_str(), dataset->base_set_size()); + + for (const auto& index : indices) { + log_info("creating algo '%s', param=%s", index.algo.c_str(), index.build_param.dump().c_str()); + auto algo = create_algo(index.algo, + dataset->distance(), + dataset->dim(), + index.refine_ratio, + index.build_param, + index.dev_list); + auto algo_property = algo->get_property(); + + const T* base_set_ptr = nullptr; + if (algo_property.dataset_memory_type == MemoryType::Host) { + log_info("%s", "loading base set to memory"); + base_set_ptr = dataset->base_set(); + } else if (algo_property.dataset_memory_type == MemoryType::HostMmap) { + log_info("%s", "mapping base set to memory"); + base_set_ptr = dataset->mapped_base_set(); + } else if (algo_property.dataset_memory_type == MemoryType::Device) { + log_info("%s", "loading base set to GPU"); + base_set_ptr = dataset->base_set_on_gpu(); + } + + log_info("building index '%s'", index.name.c_str()); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); +#ifdef NVTX + nvtxRangePush("build"); +#endif + Timer timer; + algo->build(base_set_ptr, dataset->base_set_size(), stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + float elapsed_ms = timer.elapsed_ms(); +#ifdef NVTX + nvtxRangePop(); +#endif + log_info("built index in %.2f seconds", elapsed_ms / 1000.0f); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + algo->save(index.file); + write_build_info(index.file, + dataset->name(), + dataset->distance(), + index.name, + index.algo, + index.build_param.dump(), + elapsed_ms / 1000.0f); + log_info("saved index to %s", index.file.c_str()); + } + + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); +} + +inline void write_search_result(const std::string& file_prefix, + const std::string& dataset, + const std::string& distance, + const std::string& name, + const std::string& algo, + const std::string& 
build_param, + const std::string& search_param, + int batch_size, + int run_count, + int k, + float search_time_average, + float search_time_p99, + float search_time_p999, + const int* neighbors, + size_t query_set_size) +{ + std::ofstream ofs(file_prefix + ".txt"); + if (!ofs) { throw std::runtime_error("can't open search result file: " + file_prefix + ".txt"); } + ofs << "dataset: " << dataset << "\n" + << "distance: " << distance << "\n" + << "\n" + << "name: " << name << "\n" + << "algo: " << algo << "\n" + << "build_param: " << build_param << "\n" + << "search_param: " << search_param << "\n" + << "\n" + << "batch_size: " << batch_size << "\n" + << "run_count: " << run_count << "\n" + << "k: " << k << "\n" + << "average_search_time: " << search_time_average << endl; + if (search_time_p99 != std::numeric_limits::max()) { + ofs << "p99_search_time: " << search_time_p99 << endl; + } + if (search_time_p999 != std::numeric_limits::max()) { + ofs << "p999_search_time: " << search_time_p999 << endl; + } + ofs.close(); + if (!ofs) { + throw std::runtime_error("can't write to search result file: " + file_prefix + ".txt"); + } + + BinFile neighbors_file(file_prefix + ".ibin", "w"); + neighbors_file.write(neighbors, query_set_size, k); +} + +template +inline void search(const Dataset* dataset, const std::vector& indices) +{ + if (indices.empty()) { return; } + cudaStream_t stream; + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + + log_info("loading query set from dataset '%s', #vector = %zu", + dataset->name().c_str(), + dataset->query_set_size()); + const T* query_set = dataset->query_set(); + // query set is usually much smaller than base set, so load it eagerly + const T* d_query_set = dataset->query_set_on_gpu(); + size_t query_set_size = dataset->query_set_size(); + + // currently all indices has same batch_size, k and run_count + const int batch_size = indices[0].batch_size; + const int k = indices[0].k; + const int run_count = indices[0].run_count; + log_info( + 
"basic search parameters: batch_size = %d, k = %d, run_count = %d", batch_size, k, run_count); + if (query_set_size % batch_size != 0) { + log_warn("query set size (%zu) % batch size (%d) != 0, the size of last batch is %zu", + query_set_size, + batch_size, + query_set_size % batch_size); + } + const size_t num_batches = (query_set_size - 1) / batch_size + 1; + std::size_t* neighbors = new std::size_t[query_set_size * k]; + int* neighbors_buf = new int[query_set_size * k]; + float* distances = new float[query_set_size * k]; + std::vector search_times; + search_times.reserve(num_batches); + std::size_t* d_neighbors; + float* d_distances; + RAFT_CUDA_TRY(cudaMalloc((void**)&d_neighbors, query_set_size * k * sizeof(*d_neighbors))); + RAFT_CUDA_TRY(cudaMalloc((void**)&d_distances, query_set_size * k * sizeof(*d_distances))); + + for (const auto& index : indices) { + log_info("creating algo '%s', param=%s", index.algo.c_str(), index.build_param.dump().c_str()); + auto algo = create_algo(index.algo, + dataset->distance(), + dataset->dim(), + index.refine_ratio, + index.build_param, + index.dev_list); + auto algo_property = algo->get_property(); + + log_info("loading index '%s' from file '%s'", index.name.c_str(), index.file.c_str()); + algo->load(index.file); + + const T* this_query_set = query_set; + std::size_t* this_neighbors = neighbors; + float* this_distances = distances; + if (algo_property.query_memory_type == MemoryType::Device) { + this_query_set = d_query_set; + this_neighbors = d_neighbors; + this_distances = d_distances; + } + + if (algo_property.need_dataset_when_search) { + log_info("loading base set from dataset '%s', #vector = %zu", + dataset->name().c_str(), + dataset->base_set_size()); + const T* base_set_ptr = nullptr; + if (algo_property.dataset_memory_type == MemoryType::Host) { + log_info("%s", "loading base set to memory"); + base_set_ptr = dataset->base_set(); + } else if (algo_property.dataset_memory_type == MemoryType::HostMmap) { + 
log_info("%s", "mapping base set to memory"); + base_set_ptr = dataset->mapped_base_set(); + } else if (algo_property.dataset_memory_type == MemoryType::Device) { + log_info("%s", "loading base set to GPU"); + base_set_ptr = dataset->base_set_on_gpu(); + } + algo->set_search_dataset(base_set_ptr, dataset->base_set_size()); + } + + for (int i = 0, end_i = index.search_params.size(); i != end_i; ++i) { + auto p_param = create_search_param(index.algo, index.search_params[i]); + algo->set_search_param(*p_param); + log_info("search with param: %s", index.search_params[i].dump().c_str()); + + if (algo_property.query_memory_type == MemoryType::Device) { + RAFT_CUDA_TRY(cudaMemset(d_neighbors, 0, query_set_size * k * sizeof(*d_neighbors))); + RAFT_CUDA_TRY(cudaMemset(d_distances, 0, query_set_size * k * sizeof(*d_distances))); + } else { + memset(neighbors, 0, query_set_size * k * sizeof(*neighbors)); + memset(distances, 0, query_set_size * k * sizeof(*distances)); + } + + float best_search_time_average = std::numeric_limits::max(); + float best_search_time_p99 = std::numeric_limits::max(); + float best_search_time_p999 = std::numeric_limits::max(); + for (int run = 0; run < run_count; ++run) { + log_info("run %d / %d", run + 1, run_count); + for (std::size_t batch_id = 0; batch_id < num_batches; ++batch_id) { + std::size_t row = batch_id * batch_size; + int actual_batch_size = (batch_id == num_batches - 1) ? 
query_set_size - row : batch_size; + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); +#ifdef NVTX + string nvtx_label = "batch" + to_string(batch_id); + if (run_count != 1) { nvtx_label = "run" + to_string(run) + "-" + nvtx_label; } + if (batch_id == 10) { + run = run_count - 1; + break; + } +#endif + Timer timer; +#ifdef NVTX + nvtxRangePush(nvtx_label.c_str()); +#endif + algo->search(this_query_set + row * dataset->dim(), + actual_batch_size, + k, + this_neighbors + row * k, + this_distances + row * k, + stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + float elapsed_ms = timer.elapsed_ms(); +#ifdef NVTX + nvtxRangePop(); +#endif + // If the size of the last batch is less than batch_size, don't count it for + // search time. But neighbors of the last batch will still be filled, so it's + // counted for recall calculation. + if (actual_batch_size == batch_size) { + search_times.push_back(elapsed_ms / 1000.0f); // in seconds + } + } + + float search_time_average = + std::accumulate(search_times.cbegin(), search_times.cend(), 0.0f) / search_times.size(); + best_search_time_average = std::min(best_search_time_average, search_time_average); + + if (search_times.size() >= 100) { + std::sort(search_times.begin(), search_times.end()); + + auto calc_percentile_pos = [](float percentile, size_t N) { + return static_cast(std::ceil(percentile / 100.0 * N)) - 1; + }; + + float search_time_p99 = search_times[calc_percentile_pos(99, search_times.size())]; + best_search_time_p99 = std::min(best_search_time_p99, search_time_p99); + + if (search_times.size() >= 1000) { + float search_time_p999 = search_times[calc_percentile_pos(99.9, search_times.size())]; + best_search_time_p999 = std::min(best_search_time_p999, search_time_p999); + } + } + search_times.clear(); + } + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + if (algo_property.query_memory_type == MemoryType::Device) { + RAFT_CUDA_TRY(cudaMemcpy(neighbors, + d_neighbors, + 
query_set_size * k * sizeof(*d_neighbors), + cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY(cudaMemcpy(distances, + d_distances, + query_set_size * k * sizeof(*d_distances), + cudaMemcpyDeviceToHost)); + } + + for (size_t j = 0; j < query_set_size * k; ++j) { + neighbors_buf[j] = neighbors[j]; + } + write_search_result(index.search_result_file + "." + to_string(i), + dataset->name(), + dataset->distance(), + index.name, + index.algo, + index.build_param.dump(), + index.search_params[i].dump(), + batch_size, + index.run_count, + k, + best_search_time_average, + best_search_time_p99, + best_search_time_p999, + neighbors_buf, + query_set_size); + } + + log_info("finish searching for index '%s'", index.name.c_str()); + } + + delete[] neighbors; + delete[] neighbors_buf; + delete[] distances; + RAFT_CUDA_TRY(cudaFree(d_neighbors)); + RAFT_CUDA_TRY(cudaFree(d_distances)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); +} + +inline const std::string usage(const string& argv0) +{ + return "usage: " + argv0 + " -b|s [-c] [-f] [-i index_names] conf.json\n" + + " -b: build mode, will build index\n" + + " -s: search mode, will search using built index\n" + + " one and only one of -b and -s should be specified\n" + + " -c: just check command line options and conf.json are sensible\n" + + " won't build or search\n" + " -f: force overwriting existing output files\n" + + " -i: by default will build/search all the indices found in conf.json\n" + + " '-i' can be used to select a subset of indices\n" + + " 'index_names' is a list of comma-separated index names\n" + + " '*' is allowed as the last character of a name to select all matched indices\n" + + " for example, -i \"hnsw1,hnsw2,faiss\" or -i \"hnsw*,faiss\""; +} + +template +inline int dispatch_benchmark(Configuration& conf, + std::string& index_patterns, + bool force_overwrite, + bool only_check, + bool build_mode, + bool search_mode) +{ + try { + auto dataset_conf = conf.get_dataset_conf(); + + BinDataset dataset(dataset_conf.name, + 
dataset_conf.base_file, + dataset_conf.subset_first_row, + dataset_conf.subset_size, + dataset_conf.query_file, + dataset_conf.distance); + + vector indices = conf.get_indices(index_patterns); + if (!check(indices, build_mode, force_overwrite)) { return -1; } + + std::string message = "will "; + message += build_mode ? "build:" : "search:"; + for (const auto& index : indices) { + message += "\n " + index.name; + } + log_info("%s", message.c_str()); + + if (only_check) { + log_info("%s", "all check passed, quit due to option -c"); + return 0; + } + + if (build_mode) { + build(&dataset, indices); + } else if (search_mode) { + search(&dataset, indices); + } + } catch (const std::exception& e) { + log_error("exception occurred: %s", e.what()); + return -1; + } + + return 0; +} + +inline int run_main(int argc, char** argv) +{ + bool force_overwrite = false; + bool build_mode = false; + bool search_mode = false; + bool only_check = false; + std::string index_patterns("*"); + + int opt; + while ((opt = getopt(argc, argv, "bscfi:h")) != -1) { + switch (opt) { + case 'b': build_mode = true; break; + case 's': search_mode = true; break; + case 'c': only_check = true; break; + case 'f': force_overwrite = true; break; + case 'i': index_patterns = optarg; break; + case 'h': cout << usage(argv[0]) << endl; return -1; + default: cerr << "\n" << usage(argv[0]) << endl; return -1; + } + } + if (build_mode == search_mode) { + std::cerr << "one and only one of -b and -s should be specified\n\n" << usage(argv[0]) << endl; + return -1; + } + if (argc - optind != 1) { + std::cerr << usage(argv[0]) << endl; + return -1; + } + string conf_file = argv[optind]; + + std::ifstream conf_stream(conf_file.c_str()); + if (!conf_stream) { + log_error("can't open configuration file: %s", argv[optind]); + return -1; + } + + try { + Configuration conf(conf_stream); + std::string dtype = conf.get_dataset_conf().dtype; + + if (dtype == "float") { + return dispatch_benchmark( + conf, index_patterns, 
force_overwrite, only_check, build_mode, search_mode); + } else if (dtype == "uint8") { + return dispatch_benchmark( + conf, index_patterns, force_overwrite, only_check, build_mode, search_mode); + } else if (dtype == "int8") { + return dispatch_benchmark( + conf, index_patterns, force_overwrite, only_check, build_mode, search_mode); + } else { + log_error("datatype %s not supported", dtype); + } + + } catch (const std::exception& e) { + log_error("exception occurred: %s", e.what()); + return -1; + } + + return -1; +} +}; // namespace raft::bench::ann diff --git a/cpp/src/distance/distance/specializations/detail/l1_float_float_float_int.cu b/cpp/bench/ann/src/common/benchmark_util.hpp similarity index 51% rename from cpp/src/distance/distance/specializations/detail/l1_float_float_float_int.cu rename to cpp/bench/ann/src/common/benchmark_util.hpp index 7b45e52ca1..7005883ffc 100644 --- a/cpp/src/distance/distance/specializations/detail/l1_float_float_float_int.cu +++ b/cpp/bench/ann/src/common/benchmark_util.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,26 +13,21 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#pragma once -#include -#include +#include "ann_types.hpp" +#include -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - float metric_arg); +namespace raft::bench::ann { -} // namespace detail -} // namespace distance -} // namespace raft +inline Metric parse_metric(const std::string& metric_str) +{ + if (metric_str == "inner_product") { + return raft::bench::ann::Metric::kInnerProduct; + } else if (metric_str == "euclidean") { + return raft::bench::ann::Metric::kEuclidean; + } else { + throw std::runtime_error("invalid metric: '" + metric_str + "'"); + } +} +}; // namespace raft::bench::ann \ No newline at end of file diff --git a/cpp/bench/ann/src/common/conf.cpp b/cpp/bench/ann/src/common/conf.cpp new file mode 100644 index 0000000000..f690f68783 --- /dev/null +++ b/cpp/bench/ann/src/common/conf.cpp @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "conf.h" + +#include +#include +#include +#include +#include + +#include "util.h" + +namespace raft::bench::ann { +using std::runtime_error; +using std::string; +using std::unordered_set; +using std::vector; + +Configuration::Configuration(std::istream& conf_stream) +{ + // to enable comments in json + auto conf = nlohmann::json::parse(conf_stream, nullptr, true, true); + + parse_dataset_(conf.at("dataset")); + parse_index_(conf.at("index"), conf.at("search_basic_param")); +} + +vector Configuration::get_indices(const string& patterns) const +{ + vector names; + for (const auto& index : indices_) { + names.push_back(index.name); + } + + auto matched = match_(names, patterns); + if (matched.empty()) { throw runtime_error("no available index matches '" + patterns + "'"); } + + vector res; + for (const auto& index : indices_) { + if (matched.find(index.name) != matched.end()) { res.push_back(index); } + } + return res; +} + +void Configuration::parse_dataset_(const nlohmann::json& conf) +{ + dataset_conf_.name = conf.at("name"); + dataset_conf_.base_file = conf.at("base_file"); + dataset_conf_.query_file = conf.at("query_file"); + dataset_conf_.distance = conf.at("distance"); + + if (conf.contains("subset_first_row")) { + dataset_conf_.subset_first_row = conf.at("subset_first_row"); + } + if (conf.contains("subset_size")) { dataset_conf_.subset_size = conf.at("subset_size"); } + + if (conf.contains("dtype")) { + dataset_conf_.dtype = conf.at("dtype"); + } else { + auto filename = dataset_conf_.base_file; + if (!filename.compare(filename.size() - 4, 4, "fbin")) { + dataset_conf_.dtype = "float"; + } else if (!filename.compare(filename.size() - 5, 5, "u8bin")) { + dataset_conf_.dtype = "uint8"; + } else if (!filename.compare(filename.size() - 5, 5, "i8bin")) { + dataset_conf_.dtype = "int8"; + } else { + log_error("Could not determine data type of the dataset"); + } + } +} + +void Configuration::parse_index_(const nlohmann::json& index_conf, + const 
nlohmann::json& search_basic_conf) +{ + const int batch_size = search_basic_conf.at("batch_size"); + const int k = search_basic_conf.at("k"); + const int run_count = search_basic_conf.at("run_count"); + + for (const auto& conf : index_conf) { + Index index; + index.name = conf.at("name"); + index.algo = conf.at("algo"); + index.build_param = conf.at("build_param"); + index.file = conf.at("file"); + index.batch_size = batch_size; + index.k = k; + index.run_count = run_count; + + if (conf.contains("multigpu")) { + for (auto it : conf.at("multigpu")) { + index.dev_list.push_back(it); + } + if (index.dev_list.empty()) { throw std::runtime_error("dev_list shouln't be empty!"); } + index.dev_list.shrink_to_fit(); + index.build_param["multigpu"] = conf["multigpu"]; + } + + if (conf.contains("refine_ratio")) { + float refine_ratio = conf.at("refine_ratio"); + if (refine_ratio <= 1.0f) { + throw runtime_error("'" + index.name + "': refine_ratio should > 1.0"); + } + index.refine_ratio = refine_ratio; + } + + for (const auto& param : conf.at("search_params")) { + index.search_params.push_back(param); + } + index.search_result_file = conf.at("search_result_file"); + + indices_.push_back(index); + } +} + +unordered_set Configuration::match_(const vector& candidates, + const string& patterns) const +{ + unordered_set matched; + for (const auto& pat : split(patterns, ',')) { + if (pat.empty()) { continue; } + + if (pat.back() == '*') { + auto len = pat.size() - 1; + for (const auto& item : candidates) { + if (item.compare(0, len, pat, 0, len) == 0) { matched.insert(item); } + } + } else { + for (const auto& item : candidates) { + if (item == pat) { matched.insert(item); } + } + } + } + + return matched; +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/conf.h b/cpp/bench/ann/src/common/conf.h new file mode 100644 index 0000000000..845defe94a --- /dev/null +++ b/cpp/bench/ann/src/common/conf.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2023, NVIDIA 
CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include + +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +class Configuration { + public: + struct Index { + std::string name; + std::string algo; + nlohmann::json build_param; + std::string file; + std::vector dev_list; + + int batch_size; + int k; + int run_count; + std::vector search_params; + std::string search_result_file; + float refine_ratio{0.0f}; + }; + + struct DatasetConf { + std::string name; + std::string base_file; + // use only a subset of base_file, + // the range of rows is [subset_first_row, subset_first_row + subset_size) + // however, subset_size = 0 means using all rows after subset_first_row + // that is, the subset is [subset_first_row, #rows in base_file) + size_t subset_first_row{0}; + size_t subset_size{0}; + std::string query_file; + std::string distance; + + // data type of input dataset, possible values ["float", "int8", "uint8"] + std::string dtype; + }; + + Configuration(std::istream& conf_stream); + + DatasetConf get_dataset_conf() const { return dataset_conf_; } + std::vector get_indices(const std::string& patterns) const; + + private: + void parse_dataset_(const nlohmann::json& conf); + void parse_index_(const nlohmann::json& index_conf, const nlohmann::json& search_basic_conf); + std::unordered_set match_(const std::vector& candidates, + const std::string& patterns) const; + + DatasetConf 
dataset_conf_; + std::vector indices_; +}; + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/dataset.h b/cpp/bench/ann/src/common/dataset.h new file mode 100644 index 0000000000..1244935c99 --- /dev/null +++ b/cpp/bench/ann/src/common/dataset.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace raft::bench::ann { + +// http://big-ann-benchmarks.com/index.html: +// binary format that starts with 8 bytes of data consisting of num_points(uint32_t) +// num_dimensions(uint32) followed by num_pts x num_dimensions x sizeof(type) bytes of +// data stored one vector after another. +// Data files will have suffixes .fbin, .u8bin, and .i8bin to represent float32, uint8 +// and int8 type data. +// As extensions for this benchmark, half and int data files will have suffixes .f16bin +// and .ibin, respectively. 
+template +class BinFile { + public: + BinFile(const std::string& file, + const std::string& mode, + uint32_t subset_first_row = 0, + uint32_t subset_size = 0); + ~BinFile() { fclose(fp_); } + BinFile(const BinFile&) = delete; + BinFile& operator=(const BinFile&) = delete; + + void get_shape(size_t* nrows, int* ndims) + { + assert(read_mode_); + *nrows = nrows_; + *ndims = ndims_; + } + + void read(T* data) const + { + assert(read_mode_); + size_t total = static_cast(nrows_) * ndims_; + if (fread(data, sizeof(T), total, fp_) != total) { + throw std::runtime_error("fread() BinFile " + file_ + " failed"); + } + } + + void write(const T* data, uint32_t nrows, uint32_t ndims) + { + assert(!read_mode_); + if (fwrite(&nrows, sizeof(uint32_t), 1, fp_) != 1) { + throw std::runtime_error("fwrite() BinFile " + file_ + " failed"); + } + if (fwrite(&ndims, sizeof(uint32_t), 1, fp_) != 1) { + throw std::runtime_error("fwrite() BinFile " + file_ + " failed"); + } + + size_t total = static_cast(nrows) * ndims; + if (fwrite(data, sizeof(T), total, fp_) != total) { + throw std::runtime_error("fwrite() BinFile " + file_ + " failed"); + } + } + + void* map() const + { + assert(read_mode_); + int fid = fileno(fp_); + auto mmap_ptr = mmap(NULL, file_size_, PROT_READ, MAP_PRIVATE, fid, 0); + if (mmap_ptr == MAP_FAILED) { + throw std::runtime_error("mmap error: Value of errno " + std::to_string(errno) + ", " + + std::string(strerror(errno))); + } + return mmap_ptr; + } + + void unmap(void* data) const + { + if (munmap(data, file_size_) == -1) { + throw std::runtime_error("munmap error: " + std::string(strerror(errno))); + } + } + + private: + void check_suffix_(); + + std::string file_; + FILE* fp_; + bool read_mode_; + uint32_t nrows_; + uint32_t ndims_; + size_t file_size_; +}; + +template +BinFile::BinFile(const std::string& file, + const std::string& mode, + uint32_t subset_first_row, + uint32_t subset_size) + : file_(file) +{ + check_suffix_(); + + if (mode == "r") { + read_mode_ = 
true; + } else if (mode == "w") { + read_mode_ = false; + if (subset_first_row != 0) { + throw std::runtime_error("subset_first_row should be zero for write mode"); + } + if (subset_size != 0) { throw std::runtime_error("subset_size should be zero for write mode"); } + } else { + throw std::runtime_error("BinFile's mode must be either 'r' or 'w': " + file_); + } + + fp_ = fopen(file_.c_str(), mode.c_str()); + if (!fp_) { throw std::runtime_error("open BinFile failed: " + file_); } + + if (read_mode_) { + struct stat statbuf; + if (stat(file_.c_str(), &statbuf) != 0) { throw std::runtime_error("stat() failed: " + file_); } + file_size_ = statbuf.st_size; + + uint32_t header[2]; + if (fread(header, sizeof(uint32_t), 2, fp_) != 2) { + throw std::runtime_error("read header of BinFile failed: " + file_); + } + nrows_ = header[0]; + ndims_ = header[1]; + + size_t expected_file_size = + 2 * sizeof(uint32_t) + static_cast(nrows_) * ndims_ * sizeof(T); + if (file_size_ != expected_file_size) { + throw std::runtime_error("expected file size of " + file_ + " is " + + std::to_string(expected_file_size) + ", however, actual size is " + + std::to_string(file_size_)); + } + + if (subset_first_row >= nrows_) { + throw std::runtime_error(file_ + ": subset_first_row (" + std::to_string(subset_first_row) + + ") >= nrows (" + std::to_string(nrows_) + ")"); + } + if (subset_first_row + subset_size > nrows_) { + throw std::runtime_error(file_ + ": subset_first_row (" + std::to_string(subset_first_row) + + ") + subset_size (" + std::to_string(subset_size) + ") > nrows (" + + std::to_string(nrows_) + ")"); + } + + if (subset_first_row) { + static_assert(sizeof(long) == 8, "fseek() don't support 64-bit offset"); + if (fseek(fp_, sizeof(T) * subset_first_row * ndims_, SEEK_CUR) == -1) { + throw std::runtime_error(file_ + ": fseek failed"); + } + nrows_ -= subset_first_row; + } + if (subset_size) { nrows_ = subset_size; } + } +} + +template +void BinFile::check_suffix_() +{ + auto pos = 
file_.rfind('.'); + if (pos == std::string::npos) { + throw std::runtime_error("name of BinFile doesn't have a suffix: " + file_); + } + std::string suffix = file_.substr(pos + 1); + + if constexpr (std::is_same_v) { + if (suffix != "fbin") { + throw std::runtime_error("BinFile should has .fbin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "f16bin") { + throw std::runtime_error("BinFile should has .f16bin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "ibin") { + throw std::runtime_error("BinFile should has .ibin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "u8bin") { + throw std::runtime_error("BinFile should has .u8bin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "i8bin") { + throw std::runtime_error("BinFile should has .i8bin suffix: " + file_); + } + } else { + throw std::runtime_error( + "T of BinFile should be one of float, half, int, uint8_t, or int8_t"); + } +} + +template +class Dataset { + public: + Dataset(const std::string& name) : name_(name) {} + Dataset(const std::string& name, const std::string& distance) : name_(name), distance_(distance) + { + } + Dataset(const Dataset&) = delete; + Dataset& operator=(const Dataset&) = delete; + virtual ~Dataset(); + + std::string name() const { return name_; } + std::string distance() const { return distance_; } + int dim() const { return dim_; } + size_t base_set_size() const { return base_set_size_; } + size_t query_set_size() const { return query_set_size_; } + + // load data lazily, so don't pay the overhead of reading unneeded set + // e.g. 
don't load base set when searching + const T* base_set() const + { + if (!base_set_) { load_base_set_(); } + return base_set_; + } + + const T* query_set() const + { + if (!query_set_) { load_query_set_(); } + return query_set_; + } + + const T* base_set_on_gpu() const; + const T* query_set_on_gpu() const; + const T* mapped_base_set() const; + + protected: + virtual void load_base_set_() const = 0; + virtual void load_query_set_() const = 0; + virtual void map_base_set_() const = 0; + + std::string name_; + std::string distance_; + int dim_; + size_t base_set_size_; + size_t query_set_size_; + + mutable T* base_set_ = nullptr; + mutable T* query_set_ = nullptr; + mutable T* d_base_set_ = nullptr; + mutable T* d_query_set_ = nullptr; + mutable T* mapped_base_set_ = nullptr; +}; + +template +Dataset::~Dataset() +{ + delete[] base_set_; + delete[] query_set_; + if (d_base_set_) { RAFT_CUDA_TRY_NO_THROW(cudaFree(d_base_set_)); } + if (d_query_set_) { RAFT_CUDA_TRY_NO_THROW(cudaFree(d_query_set_)); } +} + +template +const T* Dataset::base_set_on_gpu() const +{ + if (!d_base_set_) { + base_set(); + RAFT_CUDA_TRY(cudaMalloc((void**)&d_base_set_, base_set_size_ * dim_ * sizeof(T))); + RAFT_CUDA_TRY(cudaMemcpy( + d_base_set_, base_set_, base_set_size_ * dim_ * sizeof(T), cudaMemcpyHostToDevice)); + } + return d_base_set_; +} + +template +const T* Dataset::query_set_on_gpu() const +{ + if (!d_query_set_) { + query_set(); + RAFT_CUDA_TRY(cudaMalloc((void**)&d_query_set_, query_set_size_ * dim_ * sizeof(T))); + RAFT_CUDA_TRY(cudaMemcpy( + d_query_set_, query_set_, query_set_size_ * dim_ * sizeof(T), cudaMemcpyHostToDevice)); + } + return d_query_set_; +} + +template +const T* Dataset::mapped_base_set() const +{ + if (!mapped_base_set_) { map_base_set_(); } + return mapped_base_set_; +} + +template +class BinDataset : public Dataset { + public: + BinDataset(const std::string& name, + const std::string& base_file, + size_t subset_first_row, + size_t subset_size, + const 
std::string& query_file, + const std::string& distance); + ~BinDataset() + { + if (this->mapped_base_set_) { + base_file_.unmap(reinterpret_cast(this->mapped_base_set_) - subset_offset_); + } + } + + private: + void load_base_set_() const override; + void load_query_set_() const override; + void map_base_set_() const override; + + using Dataset::dim_; + using Dataset::base_set_size_; + using Dataset::query_set_size_; + + BinFile base_file_; + BinFile query_file_; + + size_t subset_offset_; +}; + +template +BinDataset::BinDataset(const std::string& name, + const std::string& base_file, + size_t subset_first_row, + size_t subset_size, + const std::string& query_file, + const std::string& distance) + : Dataset(name, distance), + base_file_(base_file, "r", subset_first_row, subset_size), + query_file_(query_file, "r"), + subset_offset_(2 * sizeof(uint32_t) + subset_first_row * dim_ * sizeof(T)) +{ + base_file_.get_shape(&base_set_size_, &dim_); + int query_dim; + query_file_.get_shape(&query_set_size_, &query_dim); + if (query_dim != dim_) { + throw std::runtime_error("base set dim (" + std::to_string(dim_) + ") != query set dim (" + + std::to_string(query_dim)); + } +} + +template +void BinDataset::load_base_set_() const +{ + this->base_set_ = new T[base_set_size_ * dim_]; + base_file_.read(this->base_set_); +} + +template +void BinDataset::load_query_set_() const +{ + this->query_set_ = new T[query_set_size_ * dim_]; + query_file_.read(this->query_set_); +} + +template +void BinDataset::map_base_set_() const +{ + char* original_map_ptr = static_cast(base_file_.map()); + this->mapped_base_set_ = reinterpret_cast(original_map_ptr + subset_offset_); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/util.cpp b/cpp/bench/ann/src/common/util.cpp new file mode 100644 index 0000000000..17636f76d7 --- /dev/null +++ b/cpp/bench/ann/src/common/util.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "util.h" + +#include +#include + +#include +#include + +namespace raft::bench::ann { + +std::vector split(const std::string& s, char delimiter) +{ + std::vector tokens; + std::string token; + std::istringstream iss(s); + while (getline(iss, token, delimiter)) { + if (!token.empty()) { tokens.push_back(token); } + } + return tokens; +} + +bool file_exists(const std::string& filename) +{ + struct stat statbuf; + if (stat(filename.c_str(), &statbuf) != 0) { return false; } + return S_ISREG(statbuf.st_mode); +} + +bool dir_exists(const std::string& dir) +{ + struct stat statbuf; + if (stat(dir.c_str(), &statbuf) != 0) { return false; } + return S_ISDIR(statbuf.st_mode); +} + +bool create_dir(const std::string& dir) +{ + const auto path = split(dir, '/'); + + std::string cwd; + if (!dir.empty() && dir[0] == '/') { cwd += '/'; } + + for (const auto& p : path) { + cwd += p + "/"; + if (!dir_exists(cwd)) { + int ret = mkdir(cwd.c_str(), S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + if (ret != 0) { return false; } + } + } + return true; +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/util.h b/cpp/bench/ann/src/common/util.h new file mode 100644 index 0000000000..290bf4cea9 --- /dev/null +++ b/cpp/bench/ann/src/common/util.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace raft::bench::ann { + +class Timer { + public: + Timer() { reset(); } + void reset() { start_time_ = std::chrono::steady_clock::now(); } + float elapsed_ms() + { + auto end_time = std::chrono::steady_clock::now(); + auto dur = + std::chrono::duration_cast>(end_time - start_time_); + return dur.count(); + } + + private: + std::chrono::steady_clock::time_point start_time_; +}; + +std::vector split(const std::string& s, char delimiter); + +bool file_exists(const std::string& filename); +bool dir_exists(const std::string& dir); +bool create_dir(const std::string& dir); + +template +void log_(const char* level, Ts... vs) +{ + char buf[20]; + std::time_t now = std::time(nullptr); + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); + printf("%s [%s] ", buf, level); + printf(vs...); + printf("\n"); + fflush(stdout); +} + +template +void log_info(Ts... vs) +{ + log_("info", vs...); +} + +template +void log_warn(Ts... vs) +{ + log_("warn", vs...); +} + +template +void log_error(Ts... 
vs) +{ + log_("error", vs...); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/faiss/faiss_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_benchmark.cu new file mode 100644 index 0000000000..294da9a14f --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_benchmark.cu @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#undef WARP_SIZE +#include "faiss_wrapper.h" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpuIVFFlat::BuildParam& param) +{ + param.nlist = conf.at("nlist"); +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpuIVFPQ::BuildParam& param) +{ + param.nlist = conf.at("nlist"); + param.M = conf.at("M"); + if (conf.contains("usePrecomputed")) { + param.usePrecomputed = conf.at("usePrecomputed"); + } else { + param.usePrecomputed = false; + } + if (conf.contains("useFloat16")) { + param.useFloat16 = conf.at("useFloat16"); + } else { + param.useFloat16 = false; + } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpuIVFSQ::BuildParam& param) +{ + param.nlist = conf.at("nlist"); + param.quantizer_type = conf.at("quantizer_type"); +} + 
+template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpu::SearchParam& param) +{ + param.nprobe = conf.at("nprobe"); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { + if (algo == "faiss_gpu_ivf_flat") { + ann = make_algo(metric, dim, conf, dev_list); + } else if (algo == "faiss_gpu_ivf_pq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_gpu_ivf_sq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_gpu_flat") { + ann = std::make_unique>(metric, dim); + } + } + + if constexpr (std::is_same_v) {} + + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "faiss_gpu_ivf_flat" || algo == "faiss_gpu_ivf_pq" || algo == "faiss_gpu_ivf_sq") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } else if (algo == "faiss_gpu_flat") { + 
auto param = std::make_unique::AnnSearchParam>(); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann + +#include "../common/benchmark.hpp" + +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/faiss/faiss_wrapper.h b/cpp/bench/ann/src/faiss/faiss_wrapper.h new file mode 100644 index 0000000000..8cfc26ea5b --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_wrapper.h @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef FAISS_WRAPPER_H_ +#define FAISS_WRAPPER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "../common/benchmark_util.hpp" +#include + +namespace { + +faiss::MetricType parse_metric_type(raft::bench::ann::Metric metric) +{ + if (metric == raft::bench::ann::Metric::kInnerProduct) { + return faiss::METRIC_INNER_PRODUCT; + } else if (metric == raft::bench::ann::Metric::kEuclidean) { + return faiss::METRIC_L2; + } else { + throw std::runtime_error("faiss supports only metric type of inner product and L2"); + } +} + +// note BLAS library can still use multi-threading, and +// setting environment variable like OPENBLAS_NUM_THREADS can control it +class OmpSingleThreadScope { + public: + OmpSingleThreadScope() + { + max_threads_ = omp_get_max_threads(); + omp_set_num_threads(1); + } + ~OmpSingleThreadScope() + { + // the best we can do + omp_set_num_threads(max_threads_); + } + + private: + int max_threads_; +}; + +} // namespace + +namespace raft::bench::ann { + +template +class FaissGpu : public ANN { + public: + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + int nprobe; + }; + + FaissGpu(Metric metric, int dim, int nlist); + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) final; + + void set_search_param(const AnnSearchParam& param) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const final; + + AlgoProperty get_property() const override + { + AlgoProperty property; + // to enable building big dataset which is larger than GPU memory + property.dataset_memory_type = MemoryType::Host; + property.query_memory_type = 
MemoryType::Device; + property.need_dataset_when_search = false; + return property; + } + + protected: + template + void save_(const std::string& file) const; + + template + void load_(const std::string& file); + + mutable faiss::gpu::StandardGpuResources gpu_resource_; + std::unique_ptr index_; + faiss::MetricType metric_type_; + int nlist_; + int device_; +}; + +template +FaissGpu::FaissGpu(Metric metric, int dim, int nlist) + : ANN(metric, dim), metric_type_(parse_metric_type(metric)), nlist_(nlist) +{ + static_assert(std::is_same_v, "faiss support only float type"); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); +} + +template +void FaissGpu::build(const T* dataset, size_t nrow, cudaStream_t stream) +{ + OmpSingleThreadScope omp_single_thread; + + gpu_resource_.setDefaultStream(device_, stream); + index_->train(nrow, dataset); // faiss::gpu::GpuIndexFlat::train() will do nothing + assert(index_->is_trained); + index_->add(nrow, dataset); +} + +template +void FaissGpu::set_search_param(const AnnSearchParam& param) +{ + int nprobe = dynamic_cast(param).nprobe; + assert(nprobe <= nlist_); + dynamic_cast(index_.get())->setNumProbes(nprobe); +} + +template +void FaissGpu::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + static_assert(sizeof(size_t) == sizeof(faiss::Index::idx_t), + "sizes of size_t and faiss::Index::idx_t are different"); + gpu_resource_.setDefaultStream(device_, stream); + index_->search( + batch_size, queries, k, distances, reinterpret_cast(neighbors)); +} + +template +template +void FaissGpu::save_(const std::string& file) const +{ + OmpSingleThreadScope omp_single_thread; + + auto cpu_index = std::make_unique(); + dynamic_cast(index_.get())->copyTo(cpu_index.get()); + faiss::write_index(cpu_index.get(), file.c_str()); +} + +template +template +void FaissGpu::load_(const std::string& file) +{ + OmpSingleThreadScope omp_single_thread; + + std::unique_ptr 
cpu_index(dynamic_cast(faiss::read_index(file.c_str()))); + assert(cpu_index); + dynamic_cast(index_.get())->copyFrom(cpu_index.get()); +} + +template +class FaissGpuIVFFlat : public FaissGpu { + public: + struct BuildParam { + int nlist; + }; + + FaissGpuIVFFlat(Metric metric, int dim, const BuildParam& param) + : FaissGpu(metric, dim, param.nlist) + { + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = this->device_; + this->index_ = std::make_unique( + &(this->gpu_resource_), dim, param.nlist, this->metric_type_, config); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } +}; + +template +class FaissGpuIVFPQ : public FaissGpu { + public: + struct BuildParam { + int nlist; + int M; + bool useFloat16; + bool usePrecomputed; + }; + + FaissGpuIVFPQ(Metric metric, int dim, const BuildParam& param) + : FaissGpu(metric, dim, param.nlist) + { + faiss::gpu::GpuIndexIVFPQConfig config; + config.useFloat16LookupTables = param.useFloat16; + config.usePrecomputedTables = param.usePrecomputed; + config.device = this->device_; + this->index_ = + std::make_unique(&(this->gpu_resource_), + dim, + param.nlist, + param.M, + 8, // FAISS only supports bitsPerCode=8 + this->metric_type_, + config); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } +}; + +template +class FaissGpuIVFSQ : public FaissGpu { + public: + struct BuildParam { + int nlist; + std::string quantizer_type; + }; + + FaissGpuIVFSQ(Metric metric, int dim, const BuildParam& param) + : FaissGpu(metric, dim, param.nlist) + { + faiss::ScalarQuantizer::QuantizerType qtype; + if (param.quantizer_type == "fp16") { + qtype = faiss::ScalarQuantizer::QT_fp16; + } else if (param.quantizer_type == "int8") { + qtype = faiss::ScalarQuantizer::QT_8bit; + } else { + 
throw std::runtime_error("FaissGpuIVFSQ supports only fp16 and int8 but got " + + param.quantizer_type); + } + + faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; + config.device = this->device_; + this->index_ = std::make_unique( + &(this->gpu_resource_), dim, param.nlist, qtype, this->metric_type_, true, config); + } + + void save(const std::string& file) const override + { + this->template save_( + file); + } + void load(const std::string& file) override + { + this->template load_( + file); + } +}; + +template +class FaissGpuFlat : public FaissGpu { + public: + FaissGpuFlat(Metric metric, int dim) : FaissGpu(metric, dim, 0) + { + faiss::gpu::GpuIndexFlatConfig config; + config.device = this->device_; + this->index_ = std::make_unique( + &(this->gpu_resource_), dim, this->metric_type_, config); + } + + // class FaissGpu is more like a IVF class, so need special treating here + void set_search_param(const typename ANN::AnnSearchParam&) override{}; + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } +}; + +} // namespace raft::bench::ann + +#endif diff --git a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu new file mode 100644 index 0000000000..8072cd857c --- /dev/null +++ b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "ggnn_wrapper.cuh" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::Ggnn::BuildParam& param) +{ + param.dataset_size = conf.at("dataset_size"); + param.k = conf.at("k"); + + if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); } + if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); } + if (conf.contains("num_layers")) { param.num_layers = conf.at("num_layers"); } + if (conf.contains("tau")) { param.tau = conf.at("tau"); } + if (conf.contains("refine_iterations")) { + param.refine_iterations = conf.at("refine_iterations"); + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::Ggnn::SearchParam& param) +{ + param.tau = conf.at("tau"); + + if (conf.contains("block_dim")) { param.block_dim = conf.at("block_dim"); } + if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); } + if (conf.contains("cache_size")) { param.cache_size = conf.at("cache_size"); } + if (conf.contains("sorted_size")) { param.sorted_size = conf.at("sorted_size"); } +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> 
create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) {} + + if constexpr (std::is_same_v) {} + + if (algo == "ggnn") { ann = make_algo(metric, dim, conf); } + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "ggnn") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann + +#include "../common/benchmark.hpp" + +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh new file mode 100644 index 0000000000..fd8fe0f2ec --- /dev/null +++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include "../common/ann_types.hpp" +#include "../common/benchmark_util.hpp" +#include +#include + +namespace raft::bench::ann { + +template +class GgnnImpl; + +template +class Ggnn : public ANN { + public: + struct BuildParam { + int k_build{24}; // KBuild + int segment_size{32}; // S + int num_layers{4}; // L + float tau{0.5}; + int refine_iterations{2}; + + size_t dataset_size; + int k; // GGNN requires to know k during building + }; + + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + float tau; + int block_dim{32}; + int max_iterations{400}; + int cache_size{512}; + int sorted_size{256}; + }; + + Ggnn(Metric metric, int dim, const BuildParam& param); + ~Ggnn() { delete impl_; } + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) override + { + impl_->build(dataset, nrow, stream); + } + + void set_search_param(const AnnSearchParam& param) override { impl_->set_search_param(param); } + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override + { + impl_->search(queries, batch_size, k, neighbors, distances, stream); + } + + void save(const std::string& file) const override { impl_->save(file); } + void load(const std::string& file) override { impl_->load(file); } + + AlgoProperty get_property() const override { return impl_->get_property(); } + + void set_search_dataset(const T* dataset, size_t nrow) override + { + impl_->set_search_dataset(dataset, nrow); + }; + + private: + ANN* impl_; +}; + +template +Ggnn::Ggnn(Metric metric, int dim, const BuildParam& param) : ANN(metric, dim) +{ + // ggnn/src/sift1m.cu + if (metric == Metric::kEuclidean && dim == 128 && param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = new GgnnImpl(metric, dim, param); + } + // ggnn/src/deep1b_multi_gpu.cu, and adapt it deep1B + else if (metric == Metric::kEuclidean && dim == 96 && 
param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = new GgnnImpl(metric, dim, param); + } else if (metric == Metric::kInnerProduct && dim == 96 && param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = new GgnnImpl(metric, dim, param); + } else if (metric == Metric::kInnerProduct && dim == 96 && param.k_build == 96 && param.k == 10 && + param.segment_size == 64) { + impl_ = new GgnnImpl(metric, dim, param); + } + // ggnn/src/glove200.cu, adapt it to glove100 + else if (metric == Metric::kInnerProduct && dim == 100 && param.k_build == 96 && param.k == 10 && + param.segment_size == 64) { + impl_ = new GgnnImpl(metric, dim, param); + } else { + throw std::runtime_error( + "ggnn: not supported combination of metric, dim and build param; " + "see Ggnn's constructor in ggnn_wrapper.cuh for available combinations"); + } +} + +template +class GgnnImpl : public ANN { + public: + using typename ANN::AnnSearchParam; + + GgnnImpl(Metric metric, int dim, const typename Ggnn::BuildParam& param); + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) override; + + void set_search_param(const AnnSearchParam& param) override; + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + void save(const std::string& file) const override; + void load(const std::string& file) override; + + AlgoProperty get_property() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Device; + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search = true; + return property; + } + + void set_search_dataset(const T* dataset, size_t nrow) override; + + private: + using ANN::metric_; + using ANN::dim_; + + using GGNNGPUInstance = GGNNGPUInstance; + std::unique_ptr ggnn_; + typename Ggnn::BuildParam build_param_; + typename Ggnn::SearchParam search_param_; +}; + +template 
+GgnnImpl::GgnnImpl(Metric metric, + int dim, + const typename Ggnn::BuildParam& param) + : ANN(metric, dim), build_param_(param) +{ + if (metric_ == Metric::kInnerProduct) { + if (measure != Cosine) { throw std::runtime_error("mis-matched metric"); } + } else if (metric_ == Metric::kEuclidean) { + if (measure != Euclidean) { throw std::runtime_error("mis-matched metric"); } + } else { + throw std::runtime_error( + "ggnn supports only metric type of InnerProduct, Cosine and Euclidean"); + } + + if (dim != D) { throw std::runtime_error("mis-matched dim"); } + + int device; + RAFT_CUDA_TRY(cudaGetDevice(&device)); + + ggnn_ = std::make_unique( + device, build_param_.dataset_size, build_param_.num_layers, true, build_param_.tau); +} + +template +void GgnnImpl::build(const T* dataset, + size_t nrow, + cudaStream_t stream) +{ + if (nrow != build_param_.dataset_size) { + throw std::runtime_error( + "build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) + + " , but nrow = " + std::to_string(nrow)); + } + + ggnn_->set_base_data(dataset); + ggnn_->set_stream(stream); + ggnn_->build(0); + for (int i = 0; i < build_param_.refine_iterations; ++i) { + ggnn_->refine(); + } +} + +template +void GgnnImpl::set_search_dataset(const T* dataset, size_t nrow) +{ + if (nrow != build_param_.dataset_size) { + throw std::runtime_error( + "build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) + + " , but nrow = " + std::to_string(nrow)); + } + ggnn_->set_base_data(dataset); +} + +template +void GgnnImpl::set_search_param(const AnnSearchParam& param) +{ + search_param_ = dynamic_cast::SearchParam&>(param); +} + +template +void GgnnImpl::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + static_assert(sizeof(size_t) == sizeof(int64_t), "sizes of size_t and GGNN's KeyT are different"); + if (k != KQuery) { + throw std::runtime_error( + "k = " + std::to_string(k) + + ", but this 
GGNN instance only supports k = " + std::to_string(KQuery)); + } + + ggnn_->set_stream(stream); + RAFT_CUDA_TRY(cudaMemcpyToSymbol(c_tau_query, &search_param_.tau, sizeof(float))); + + const int block_dim = search_param_.block_dim; + const int max_iterations = search_param_.max_iterations; + const int cache_size = search_param_.cache_size; + const int sorted_size = search_param_.sorted_size; + // default value + if (block_dim == 32 && max_iterations == 400 && cache_size == 512 && sorted_size == 256) { + ggnn_->template queryLayer<32, 400, 512, 256, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/sift1m.cu + else if (block_dim == 32 && max_iterations == 200 && cache_size == 256 && sorted_size == 64) { + ggnn_->template queryLayer<32, 200, 256, 64, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/sift1m.cu + else if (block_dim == 32 && max_iterations == 400 && cache_size == 448 && sorted_size == 64) { + ggnn_->template queryLayer<32, 400, 448, 64, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/glove200.cu + else if (block_dim == 128 && max_iterations == 2000 && cache_size == 2048 && sorted_size == 32) { + ggnn_->template queryLayer<128, 2000, 2048, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // for glove100 + else if (block_dim == 64 && max_iterations == 400 && cache_size == 512 && sorted_size == 32) { + ggnn_->template queryLayer<64, 400, 512, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } else if (block_dim == 128 && max_iterations == 2000 && cache_size == 1024 && + sorted_size == 32) { + ggnn_->template queryLayer<128, 2000, 1024, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } else { + throw std::runtime_error("ggnn: not supported search param"); + } +} + +template +void GgnnImpl::save(const std::string& file) const +{ + auto& ggnn_host = 
ggnn_->ggnn_cpu_buffers.at(0); + auto& ggnn_device = ggnn_->ggnn_shards.at(0); + ggnn_->set_stream(0); + + ggnn_host.downloadAsync(ggnn_device); + RAFT_CUDA_TRY(cudaStreamSynchronize(ggnn_device.stream)); + ggnn_host.store(file); +} + +template +void GgnnImpl::load(const std::string& file) +{ + auto& ggnn_host = ggnn_->ggnn_cpu_buffers.at(0); + auto& ggnn_device = ggnn_->ggnn_shards.at(0); + ggnn_->set_stream(0); + + ggnn_host.load(file); + ggnn_host.uploadAsync(ggnn_device); + RAFT_CUDA_TRY(cudaStreamSynchronize(ggnn_device.stream)); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp new file mode 100644 index 0000000000..cd823e8a69 --- /dev/null +++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "../common/benchmark_util.hpp" + +#include "../common/ann_types.hpp" +#undef WARP_SIZE +#include "hnswlib_wrapper.h" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::HnswLib::BuildParam& param) +{ + param.ef_construction = conf.at("efConstruction"); + param.M = conf.at("M"); + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::HnswLib::SearchParam& param) +{ + param.ef = conf.at("ef"); + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { + if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } + } + + if constexpr (std::is_same_v) { + if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } + } + + if (!ann) { throw std::runtime_error("invalid 
algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "hnswlib") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +}; // namespace raft::bench::ann + +#include "../common/benchmark.hpp" + +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h new file mode 100644 index 0000000000..c5c3a4a2a6 --- /dev/null +++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include + +namespace raft::bench::ann { + +template +struct hnsw_dist_t { + using type = void; +}; + +template <> +struct hnsw_dist_t { + using type = float; +}; + +template <> +struct hnsw_dist_t { + using type = int; +}; + +class FixedThreadPool { + public: + FixedThreadPool(int num_threads) + { + if (num_threads < 1) { + throw std::runtime_error("num_threads must >= 1"); + } else if (num_threads == 1) { + return; + } + + tasks_ = new Task_[num_threads]; + + threads_.reserve(num_threads); + for (int i = 0; i < num_threads; ++i) { + threads_.emplace_back([&, i] { + auto& task = tasks_[i]; + while (true) { + std::unique_lock lock(task.mtx); + task.cv.wait(lock, + [&] { return task.has_task || finished_.load(std::memory_order_relaxed); }); + if (finished_.load(std::memory_order_relaxed)) { break; } + + task.task(); + task.has_task = false; + } + }); + } + } + + ~FixedThreadPool() + { + if (threads_.empty()) { return; } + + finished_.store(true, std::memory_order_relaxed); + for (unsigned i = 0; i < threads_.size(); ++i) { + auto& task = tasks_[i]; + std::lock_guard(task.mtx); + + task.cv.notify_one(); + threads_[i].join(); + } + + delete[] tasks_; + } + + template + void submit(Func f, IdxT len) + { + if (threads_.empty()) { + for (IdxT i = 0; i < len; ++i) { + f(i); + } + return; + } + + const int num_threads = threads_.size(); + // one extra part for competition among threads + const IdxT items_per_thread = len / (num_threads + 1); + std::atomic cnt(items_per_thread * num_threads); + + auto wrapped_f = [&](IdxT start, IdxT end) { + for (IdxT i = start; i < end; ++i) { + f(i); + } + + while (true) { + IdxT i = cnt.fetch_add(1, std::memory_order_relaxed); + if (i >= len) { break; } + f(i); + } + }; + + std::vector> futures; + futures.reserve(num_threads); + 
for (int i = 0; i < num_threads; ++i) { + IdxT start = i * items_per_thread; + auto& task = tasks_[i]; + { + std::lock_guard lock(task.mtx); + (void)lock; // stop nvcc warning + task.task = std::packaged_task([=] { wrapped_f(start, start + items_per_thread); }); + futures.push_back(task.task.get_future()); + task.has_task = true; + } + task.cv.notify_one(); + } + + for (auto& fut : futures) { + fut.wait(); + } + return; + } + + private: + struct alignas(64) Task_ { + std::mutex mtx; + std::condition_variable cv; + bool has_task = false; + std::packaged_task task; + }; + + Task_* tasks_; + std::vector threads_; + std::atomic finished_{false}; +}; + +template +class HnswLib : public ANN { + public: + // https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md + struct BuildParam { + int M; + int ef_construction; + int num_threads{1}; + }; + + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + int ef; + int num_threads{1}; + }; + + HnswLib(Metric metric, int dim, const BuildParam& param); + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) override; + + void set_search_param(const AnnSearchParam& param) override; + void search(const T* query, + int batch_size, + int k, + size_t* indices, + float* distances, + cudaStream_t stream = 0) const override; + + void save(const std::string& path_to_index) const override; + void load(const std::string& path_to_index) override; + + AlgoProperty get_property() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Host; + property.query_memory_type = MemoryType::Host; + property.need_dataset_when_search = false; + return property; + } + + private: + void get_search_knn_results_(const T* query, int k, size_t* indices, float* distances) const; + + std::unique_ptr::type>> appr_alg_; + std::unique_ptr::type>> space_; + + using ANN::metric_; + using ANN::dim_; + int ef_construction_; + int m_; + int num_threads_; + std::unique_ptr 
thread_pool_; +}; + +template +HnswLib::HnswLib(Metric metric, int dim, const BuildParam& param) : ANN(metric, dim) +{ + assert(dim_ > 0); + static_assert(std::is_same_v || std::is_same_v); + if constexpr (std::is_same_v) { + if (metric_ != Metric::kEuclidean) { + throw std::runtime_error("hnswlib only supports Euclidean distance"); + } + } + + ef_construction_ = param.ef_construction; + m_ = param.M; + num_threads_ = param.num_threads; +} + +template +void HnswLib::build(const T* dataset, size_t nrow, cudaStream_t) +{ + if constexpr (std::is_same_v) { + if (metric_ == Metric::kInnerProduct) { + space_ = std::make_unique(dim_); + } else { + space_ = std::make_unique(dim_); + } + } else if constexpr (std::is_same_v) { + space_ = std::make_unique(dim_); + } + + appr_alg_ = std::make_unique::type>>( + space_.get(), nrow, m_, ef_construction_); + + thread_pool_ = std::make_unique(num_threads_); + const size_t items_per_thread = nrow / (num_threads_ + 1); + + thread_pool_->submit( + [&](size_t i) { + if (i < items_per_thread && i % 10000 == 0) { + char buf[20]; + std::time_t now = std::time(nullptr); + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); + + printf("%s building %zu / %zu\n", buf, i, items_per_thread); + fflush(stdout); + } + + appr_alg_->addPoint(dataset + i * dim_, i); + }, + nrow); +} + +template +void HnswLib::set_search_param(const AnnSearchParam& param_) +{ + auto param = dynamic_cast(param_); + appr_alg_->ef_ = param.ef; + + if (!thread_pool_ || num_threads_ != param.num_threads) { + num_threads_ = param.num_threads; + thread_pool_ = std::make_unique(num_threads_); + } +} + +template +void HnswLib::search( + const T* query, int batch_size, int k, size_t* indices, float* distances, cudaStream_t) const +{ + thread_pool_->submit( + [&](int i) { + get_search_knn_results_(query + i * dim_, k, indices + i * k, distances + i * k); + }, + batch_size); +} + +template +void HnswLib::save(const std::string& path_to_index) const +{ + 
appr_alg_->saveIndex(std::string(path_to_index)); +} + +template +void HnswLib::load(const std::string& path_to_index) +{ + if constexpr (std::is_same_v) { + if (metric_ == Metric::kInnerProduct) { + space_ = std::make_unique(dim_); + } else { + space_ = std::make_unique(dim_); + } + } else if constexpr (std::is_same_v) { + space_ = std::make_unique(dim_); + } + + appr_alg_ = std::make_unique::type>>( + space_.get(), path_to_index); +} + +template +void HnswLib::get_search_knn_results_(const T* query, + int k, + size_t* indices, + float* distances) const +{ + auto result = appr_alg_->searchKnn(query, k); + assert(result.size() >= static_cast(k)); + + for (int i = k - 1; i >= 0; --i) { + indices[i] = result.top().second; + distances[i] = result.top().first; + result.pop(); + } +} + +}; // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h new file mode 100644 index 0000000000..cb30c2693f --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::ann { + +inline raft::distance::DistanceType parse_metric_type(raft::bench::ann::Metric metric) +{ + if (metric == raft::bench::ann::Metric::kInnerProduct) { + return raft::distance::DistanceType::InnerProduct; + } else if (metric == raft::bench::ann::Metric::kEuclidean) { + // Even for L2 expanded RAFT IVF Flat uses unexpanded formula + return raft::distance::DistanceType::L2Expanded; + } else { + throw std::runtime_error("raft supports only metric type of inner product and L2"); + } +} +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu new file mode 100644 index 0000000000..d8e98ce2a9 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef RAFT_COMPILED +#include +#endif + +#include "../common/ann_types.hpp" +#include "../common/benchmark_util.hpp" +#undef WARP_SIZE +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN +#include "raft_wrapper.h" +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT +#include "raft_ivf_flat_wrapper.h" +extern template class raft::bench::ann::RaftIvfFlatGpu; +extern template class raft::bench::ann::RaftIvfFlatGpu; +extern template class raft::bench::ann::RaftIvfFlatGpu; +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ +#include "raft_ivf_pq_wrapper.h" +extern template class raft::bench::ann::RaftIvfPQ; +extern template class raft::bench::ann::RaftIvfPQ; +extern template class raft::bench::ann::RaftIvfPQ; +#endif +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfFlatGpu::BuildParam& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { + param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); + std::cout << "kmeans_trainset_fraction " << param.kmeans_trainset_fraction; + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfFlatGpu::SearchParam& param) +{ + param.ivf_flat_params.n_probes = conf.at("nprobe"); +} +#endif + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfPQ::BuildParam& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } + if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } + if (conf.contains("pq_dim")) { 
param.pq_dim = conf.at("pq_dim"); } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfPQ::SearchParam& param) +{ + param.pq_param.n_probes = conf.at("numProbes"); + if (conf.contains("internalDistanceDtype")) { + std::string type = conf.at("internalDistanceDtype"); + if (type == "float") { + param.pq_param.internal_distance_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } else { + throw std::runtime_error("internalDistanceDtype: '" + type + + "', should be either 'float' or 'half'"); + } + } else { + // set half as default type + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } + + if (conf.contains("smemLutDtype")) { + std::string type = conf.at("smemLutDtype"); + if (type == "float") { + param.pq_param.lut_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.lut_dtype = CUDA_R_16F; + } else if (type == "fp8") { + param.pq_param.lut_dtype = CUDA_R_8U; + } else { + throw std::runtime_error("smemLutDtype: '" + type + + "', should be either 'float', 'half' or 'fp8'"); + } + } else { + // set half as default + param.pq_param.lut_dtype = CUDA_R_16F; + } +} +#endif + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all 
algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN + if (algo == "raft_bfknn") { ann = std::make_unique>(metric, dim); } +#endif + } + + if constexpr (std::is_same_v) {} + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT + if (algo == "raft_ivf_flat") { + typename raft::bench::ann::RaftIvfFlatGpu::BuildParam param; + parse_build_param(conf, param); + ann = std::make_unique>(metric, dim, param); + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ + if (algo == "raft_ivf_pq") { + typename raft::bench::ann::RaftIvfPQ::BuildParam param; + parse_build_param(conf, param); + ann = + std::make_unique>(metric, dim, param, refine_ratio); + } +#endif + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN + if (algo == "raft_bfknn") { + auto param = std::make_unique::AnnSearchParam>(); + return param; + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT + if (algo == "raft_ivf_flat") { + auto param = + std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ + if (algo == "raft_ivf_pq") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +}; // namespace raft::bench::ann + +#include "../common/benchmark.hpp" + +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/src/distance/distance/specializations/detail/canberra_float_float_float_int.cu b/cpp/bench/ann/src/raft/raft_ivf_flat.cu similarity index 
51% rename from cpp/src/distance/distance/specializations/detail/canberra_float_float_float_int.cu rename to cpp/bench/ann/src/raft/raft_ivf_flat.cu index 6dfc385e55..ff108080b5 100644 --- a/cpp/src/distance/distance/specializations/detail/canberra_float_float_float_int.cu +++ b/cpp/bench/ann/src/raft/raft_ivf_flat.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,26 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "raft_ivf_flat_wrapper.h" -#include -#include +#ifdef RAFT_COMPILED +#include +#endif -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft +namespace raft::bench::ann { +template class RaftIvfFlatGpu; +template class RaftIvfFlatGpu; +template class RaftIvfFlatGpu; +} // namespace raft::bench::ann \ No newline at end of file diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h new file mode 100644 index 0000000000..8b2a7d329b --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "raft_ann_bench_utils.h" +#include + +namespace raft::bench::ann { + +template +class RaftIvfFlatGpu : public ANN { + public: + using typename ANN::AnnSearchParam; + + struct SearchParam : public AnnSearchParam { + raft::neighbors::ivf_flat::search_params ivf_flat_params; + }; + + using BuildParam = raft::neighbors::ivf_flat::index_params; + + RaftIvfFlatGpu(Metric metric, int dim, const BuildParam& param); + + void build(const T* dataset, size_t nrow, cudaStream_t stream) final; + + void set_search_param(const AnnSearchParam& param) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + // to enable dataset access from GPU memory + AlgoProperty get_property() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Device; + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search = false; + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + + private: + raft::device_resources handle_; + BuildParam index_params_; + raft::neighbors::ivf_flat::search_params 
search_params_; + std::optional> index_; + int device_; + int dimension_; + rmm::mr::pool_memory_resource mr_; +}; + +template +RaftIvfFlatGpu::RaftIvfFlatGpu(Metric metric, int dim, const BuildParam& param) + : ANN(metric, dim), + index_params_(param), + dimension_(dim), + mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) +{ + index_params_.metric = parse_metric_type(metric); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); +} + +template +void RaftIvfFlatGpu::build(const T* dataset, size_t nrow, cudaStream_t) +{ + index_.emplace( + raft::neighbors::ivf_flat::build(handle_, index_params_, dataset, IdxT(nrow), dimension_)); + return; +} + +template +void RaftIvfFlatGpu::set_search_param(const AnnSearchParam& param) +{ + auto search_param = dynamic_cast(param); + search_params_ = search_param.ivf_flat_params; + assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void RaftIvfFlatGpu::save(const std::string& file) const +{ + raft::neighbors::ivf_flat::serialize(handle_, file, *index_); + return; +} + +template +void RaftIvfFlatGpu::load(const std::string& file) +{ + index_ = raft::neighbors::ivf_flat::deserialize(handle_, file); + return; +} + +template +void RaftIvfFlatGpu::search( + const T* queries, int batch_size, int k, size_t* neighbors, float* distances, cudaStream_t) const +{ + rmm::mr::device_memory_resource* mr_ptr = &const_cast(this)->mr_; + static_assert(sizeof(size_t) == sizeof(IdxT), "IdxT is incompatible with size_t"); + raft::neighbors::ivf_flat::search( + handle_, search_params_, *index_, queries, batch_size, k, (IdxT*)neighbors, distances, mr_ptr); + handle_.sync_stream(); + return; +} +} // namespace raft::bench::ann diff --git a/cpp/src/distance/distance/specializations/detail/l_inf_float_float_float_int.cu b/cpp/bench/ann/src/raft/raft_ivf_pq.cu similarity index 50% rename from cpp/src/distance/distance/specializations/detail/l_inf_float_float_float_int.cu rename to cpp/bench/ann/src/raft/raft_ivf_pq.cu index 
e418fc455f..338bc9a32f 100644 --- a/cpp/src/distance/distance/specializations/detail/l_inf_float_float_float_int.cu +++ b/cpp/bench/ann/src/raft/raft_ivf_pq.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,26 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "raft_ivf_pq_wrapper.h" -#include -#include +#ifdef RAFT_COMPILED +#include +#endif -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - float metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft +namespace raft::bench::ann { +template class RaftIvfPQ; +template class RaftIvfPQ; +template class RaftIvfPQ; +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h new file mode 100644 index 0000000000..70dff81847 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "raft_ann_bench_utils.h" +#include + +namespace raft::bench::ann { + +template +class RaftIvfPQ : public ANN { + public: + using typename ANN::AnnSearchParam; + + struct SearchParam : public AnnSearchParam { + raft::neighbors::ivf_pq::search_params pq_param; + }; + + using BuildParam = raft::neighbors::ivf_pq::index_params; + + RaftIvfPQ(Metric metric, int dim, const BuildParam& param, float refine_ratio); + + void build(const T* dataset, size_t nrow, cudaStream_t stream) final; + + void set_search_param(const AnnSearchParam& param) override; + void set_search_dataset(const T* dataset, size_t nrow) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + // to enable dataset access from GPU memory + AlgoProperty get_property() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Host; + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search = true; // actually it is only used during refinement + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + + private: + raft::device_resources handle_; + BuildParam index_params_; + raft::neighbors::ivf_pq::search_params search_params_; + std::optional> index_; + int device_; + int dimension_; + float refine_ratio_ = 1.0; + rmm::mr::pool_memory_resource mr_; + raft::device_matrix_view dataset_; +}; +template +RaftIvfPQ::RaftIvfPQ(Metric metric, int dim, const BuildParam& param, float refine_ratio) + : ANN(metric, dim), + index_params_(param), + 
dimension_(dim), + refine_ratio_(refine_ratio), + mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) +{ + index_params_.metric = parse_metric_type(metric); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); +} + +template +void RaftIvfPQ::save(const std::string& file) const +{ + raft::runtime::neighbors::ivf_pq::serialize(handle_, file, *index_); +} + +template +void RaftIvfPQ::load(const std::string& file) +{ + auto index_tmp = raft::neighbors::ivf_pq::index(handle_, index_params_, dimension_); + raft::runtime::neighbors::ivf_pq::deserialize(handle_, file, &index_tmp); + index_.emplace(std::move(index_tmp)); + return; +} + +template +void RaftIvfPQ::build(const T* dataset, size_t nrow, cudaStream_t) +{ + auto dataset_v = raft::make_device_matrix_view(dataset, IdxT(nrow), index_->dim()); + + index_.emplace(raft::runtime::neighbors::ivf_pq::build(handle_, index_params_, dataset_v)); + return; +} + +template +void RaftIvfPQ::set_search_param(const AnnSearchParam& param) +{ + auto search_param = dynamic_cast(param); + search_params_ = search_param.pq_param; + assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void RaftIvfPQ::set_search_dataset(const T* dataset, size_t nrow) +{ + dataset_ = raft::make_device_matrix_view(dataset, nrow, index_->dim()); +} + +template +void RaftIvfPQ::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + if (refine_ratio_ > 1.0f) { + uint32_t k0 = static_cast(refine_ratio_ * k); + auto queries_v = + raft::make_device_matrix_view(queries, batch_size, index_->dim()); + auto distances_tmp = raft::make_device_matrix(handle_, batch_size, k0); + auto candidates = raft::make_device_matrix(handle_, batch_size, k0); + + raft::runtime::neighbors::ivf_pq::search( + handle_, search_params_, *index_, queries_v, candidates.view(), distances_tmp.view()); + + if (get_property().dataset_memory_type == MemoryType::Device) { + auto queries_v = + 
raft::make_device_matrix_view(queries, batch_size, index_->dim()); + auto neighbors_v = raft::make_device_matrix_view((IdxT*)neighbors, batch_size, k); + auto distances_v = raft::make_device_matrix_view(distances, batch_size, k); + + raft::runtime::neighbors::refine(handle_, + dataset_, + queries_v, + candidates.view(), + neighbors_v, + distances_v, + index_->metric()); + } else { + auto queries_host = raft::make_host_matrix(batch_size, index_->dim()); + auto candidates_host = raft::make_host_matrix(batch_size, k0); + auto neighbors_host = raft::make_host_matrix(batch_size, k); + auto distances_host = raft::make_host_matrix(batch_size, k); + + raft::copy(queries_host.data_handle(), queries, queries_host.size(), handle_.get_stream()); + raft::copy(candidates_host.data_handle(), + candidates.data_handle(), + candidates_host.size(), + handle_.get_stream()); + + auto dataset_v = raft::make_host_matrix_view( + dataset_.data_handle(), batch_size, index_->dim()); + + raft::runtime::neighbors::refine(handle_, + dataset_v, + queries_host.view(), + candidates_host.view(), + neighbors_host.view(), + distances_host.view(), + index_->metric()); + + raft::copy(neighbors, + (size_t*)neighbors_host.data_handle(), + neighbors_host.size(), + handle_.get_stream()); + raft::copy( + distances, distances_host.data_handle(), distances_host.size(), handle_.get_stream()); + } + } else { + auto queries_v = + raft::make_device_matrix_view(queries, batch_size, index_->dim()); + auto neighbors_v = raft::make_device_matrix_view((IdxT*)neighbors, batch_size, k); + auto distances_v = raft::make_device_matrix_view(distances, batch_size, k); + + raft::runtime::neighbors::ivf_pq::search( + handle_, search_params_, *index_, queries_v, neighbors_v, distances_v); + } + handle_.sync_stream(); + return; +} +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_wrapper.h b/cpp/bench/ann/src/raft/raft_wrapper.h new file mode 100644 index 0000000000..01f206ab70 --- /dev/null +++ 
b/cpp/bench/ann/src/raft/raft_wrapper.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" + +namespace raft_temp { + +inline raft::distance::DistanceType parse_metric_type(raft::bench::ann::Metric metric) +{ + if (metric == raft::bench::ann::Metric::kInnerProduct) { + return raft::distance::DistanceType::InnerProduct; + } else if (metric == raft::bench::ann::Metric::kEuclidean) { + return raft::distance::DistanceType::L2Expanded; + } else { + throw std::runtime_error("raft supports only metric type of inner product and L2"); + } +} + +} // namespace raft_temp + +namespace raft::bench::ann { + +// brute force fused L2 KNN - RAFT +template +class RaftGpu : public ANN { + public: + using typename ANN::AnnSearchParam; + + RaftGpu(Metric metric, int dim); + + void build(const T*, size_t, cudaStream_t) final; + + void set_search_param(const AnnSearchParam& param) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const final; + + // to enable dataset access from GPU memory + AlgoProperty get_property() const override + { + AlgoProperty property; + 
property.dataset_memory_type = MemoryType::Device; + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search = true; + return property; + } + void set_search_dataset(const T* dataset, size_t nrow) override; + void save(const std::string& file) const override; + void load(const std::string&) override { return; }; + + protected: + raft::distance::DistanceType metric_type_; + int device_; + const T* dataset_; + size_t nrow_; +}; + +template +RaftGpu::RaftGpu(Metric metric, int dim) + : ANN(metric, dim), metric_type_(raft_temp::parse_metric_type(metric)) +{ + static_assert(std::is_same_v, "raft support only float type"); + assert(metric_type_ == raft::distance::DistanceType::L2Expanded); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); +} + +template +void RaftGpu::build(const T*, size_t, cudaStream_t) +{ + // as this is brute force algo so no index building required + return; +} + +template +void RaftGpu::set_search_param(const AnnSearchParam&) +{ + // Nothing to set here as it is brute force implementation +} + +template +void RaftGpu::set_search_dataset(const T* dataset, size_t nrow) +{ + dataset_ = dataset; + nrow_ = nrow; +} + +template +void RaftGpu::save(const std::string& file) const +{ + // create a empty index file as no index to store. 
+ std::fstream fp; + fp.open(file.c_str(), std::ios::out); + if (!fp) { + printf("Error in creating file!!!\n"); + ; + return; + } + fp.close(); +} + +template +void RaftGpu::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + raft::spatial::knn::detail::fusedL2Knn(this->dim_, + reinterpret_cast(neighbors), + distances, + dataset_, + queries, + nrow_, + static_cast(batch_size), + k, + true, + true, + stream, + metric_type_); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/matrix/select_k.cu b/cpp/bench/matrix/select_k.cu deleted file mode 100644 index 3b6f031c77..0000000000 --- a/cpp/bench/matrix/select_k.cu +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include - -#include -#include -#include -#include - -#if defined RAFT_DISTANCE_COMPILED -#include -#endif - -#include -#include -#include - -#include -#include -#include - -namespace raft::matrix { - -using namespace raft::bench; // NOLINT - -template -struct selection : public fixture { - explicit selection(const select::params& p) - : params_(p), - in_dists_(p.batch_size * p.len, stream), - in_ids_(p.batch_size * p.len, stream), - out_dists_(p.batch_size * p.k, stream), - out_ids_(p.batch_size * p.k, stream) - { - raft::sparse::iota_fill(in_ids_.data(), IdxT(p.batch_size), IdxT(p.len), stream); - raft::random::RngState state{42}; - raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), KeyT(-1.0), KeyT(1.0)); - } - - void run_benchmark(::benchmark::State& state) override // NOLINT - { - device_resources handle{stream}; - using_pool_memory_res res; - try { - std::ostringstream label_stream; - label_stream << params_.batch_size << "#" << params_.len << "#" << params_.k; - state.SetLabel(label_stream.str()); - loop_on_state(state, [this, &handle]() { - select::select_k_impl(handle, - Algo, - in_dists_.data(), - in_ids_.data(), - params_.batch_size, - params_.len, - params_.k, - out_dists_.data(), - out_ids_.data(), - params_.select_min); - }); - } catch (raft::exception& e) { - state.SkipWithError(e.what()); - } - } - - private: - const select::params params_; - rmm::device_uvector in_dists_, out_dists_; - rmm::device_uvector in_ids_, out_ids_; -}; - -const std::vector kInputs{ - {20000, 500, 1, true}, {20000, 500, 2, true}, {20000, 500, 4, true}, - {20000, 500, 8, true}, {20000, 500, 16, true}, {20000, 500, 32, true}, - {20000, 500, 64, true}, {20000, 500, 128, true}, {20000, 500, 256, true}, - - {1000, 10000, 1, true}, {1000, 10000, 2, true}, {1000, 10000, 4, true}, - {1000, 10000, 8, true}, {1000, 10000, 16, true}, {1000, 10000, 32, true}, - {1000, 10000, 64, true}, {1000, 10000, 128, true}, {1000, 10000, 256, true}, - - 
{100, 100000, 1, true}, {100, 100000, 2, true}, {100, 100000, 4, true}, - {100, 100000, 8, true}, {100, 100000, 16, true}, {100, 100000, 32, true}, - {100, 100000, 64, true}, {100, 100000, 128, true}, {100, 100000, 256, true}, - - {10, 1000000, 1, true}, {10, 1000000, 2, true}, {10, 1000000, 4, true}, - {10, 1000000, 8, true}, {10, 1000000, 16, true}, {10, 1000000, 32, true}, - {10, 1000000, 64, true}, {10, 1000000, 128, true}, {10, 1000000, 256, true}, -}; - -#define SELECTION_REGISTER(KeyT, IdxT, A) \ - namespace BENCHMARK_PRIVATE_NAME(selection) \ - { \ - using SelectK = selection; \ - RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #A, kInputs); \ - } - -SELECTION_REGISTER(float, uint32_t, kPublicApi); // NOLINT -SELECTION_REGISTER(float, uint32_t, kRadix8bits); // NOLINT -SELECTION_REGISTER(float, uint32_t, kRadix11bits); // NOLINT -SELECTION_REGISTER(float, uint32_t, kWarpAuto); // NOLINT -SELECTION_REGISTER(float, uint32_t, kWarpImmediate); // NOLINT -SELECTION_REGISTER(float, uint32_t, kWarpFiltered); // NOLINT -SELECTION_REGISTER(float, uint32_t, kWarpDistributed); // NOLINT -SELECTION_REGISTER(float, uint32_t, kWarpDistributedShm); // NOLINT - -SELECTION_REGISTER(double, uint32_t, kRadix8bits); // NOLINT -SELECTION_REGISTER(double, uint32_t, kRadix11bits); // NOLINT -SELECTION_REGISTER(double, uint32_t, kWarpAuto); // NOLINT - -SELECTION_REGISTER(double, int64_t, kRadix8bits); // NOLINT -SELECTION_REGISTER(double, int64_t, kRadix11bits); // NOLINT -SELECTION_REGISTER(double, int64_t, kWarpImmediate); // NOLINT -SELECTION_REGISTER(double, int64_t, kWarpFiltered); // NOLINT -SELECTION_REGISTER(double, int64_t, kWarpDistributed); // NOLINT -SELECTION_REGISTER(double, int64_t, kWarpDistributedShm); // NOLINT - -} // namespace raft::matrix diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt new file mode 100644 index 0000000000..f03a552c1d --- /dev/null +++ b/cpp/bench/prims/CMakeLists.txt @@ -0,0 +1,141 @@ +# 
============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# ################################################################################################## +# * compiler function ----------------------------------------------------------------------------- + +function(ConfigureBench) + + set(options OPTIONAL LIB) + set(oneValueArgs NAME) + set(multiValueArgs PATH TARGETS CONFIGURATIONS) + + cmake_parse_arguments(ConfigureBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(BENCH_NAME ${ConfigureBench_NAME}) + + add_executable(${BENCH_NAME} ${ConfigureBench_PATH}) + + target_link_libraries( + ${BENCH_NAME} + PRIVATE raft::raft + raft_internal + $<$:raft::compiled> + benchmark::benchmark + Threads::Threads + $ + $ + ) + + set_target_properties( + ${BENCH_NAME} + PROPERTIES # set target compile options + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + + target_compile_options( + ${BENCH_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) + + target_include_directories(${BENCH_NAME} PUBLIC "$") + + install( + TARGETS ${BENCH_NAME} + COMPONENT testing + DESTINATION 
bin/gbench/prims/libraft + EXCLUDE_FROM_ALL + ) + +endfunction() + +if(BUILD_BENCH) + ConfigureBench( + NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu + bench/prims/main.cpp OPTIONAL LIB + ) + + ConfigureBench( + NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu + bench/prims/distance/tune_pairwise/bench.cu bench/prims/main.cpp + ) + + ConfigureBench( + NAME + DISTANCE_BENCH + PATH + bench/prims/distance/distance_cosine.cu + bench/prims/distance/distance_exp_l2.cu + bench/prims/distance/distance_l1.cu + bench/prims/distance/distance_unexp_l2.cu + bench/prims/distance/fused_l2_nn.cu + bench/prims/distance/masked_nn.cu + bench/prims/distance/kernels.cu + bench/prims/main.cpp + OPTIONAL + LIB + ) + + ConfigureBench( + NAME + LINALG_BENCH + PATH + bench/prims/linalg/add.cu + bench/prims/linalg/map_then_reduce.cu + bench/prims/linalg/matrix_vector_op.cu + bench/prims/linalg/norm.cu + bench/prims/linalg/normalize.cu + bench/prims/linalg/reduce_cols_by_key.cu + bench/prims/linalg/reduce_rows_by_key.cu + bench/prims/linalg/reduce.cu + bench/prims/main.cpp + ) + + ConfigureBench( + NAME MATRIX_BENCH PATH bench/prims/matrix/argmin.cu bench/prims/matrix/gather.cu + bench/prims/matrix/select_k.cu bench/prims/main.cpp OPTIONAL LIB + ) + + ConfigureBench( + NAME RANDOM_BENCH PATH bench/prims/random/make_blobs.cu bench/prims/random/permute.cu + bench/prims/random/rng.cu bench/prims/main.cpp + ) + + ConfigureBench(NAME SPARSE_BENCH PATH bench/prims/sparse/convert_csr.cu bench/prims/main.cpp) + + ConfigureBench( + NAME + NEIGHBORS_BENCH + PATH + bench/prims/neighbors/knn/brute_force_float_int64_t.cu + bench/prims/neighbors/knn/brute_force_float_uint32_t.cu + bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu + bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu + bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu + bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu + 
bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu + bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu + bench/prims/neighbors/refine_float_int64_t.cu + bench/prims/neighbors/refine_uint8_t_int64_t.cu + bench/prims/main.cpp + OPTIONAL + LIB + ) +endif() diff --git a/cpp/bench/cluster/kmeans.cu b/cpp/bench/prims/cluster/kmeans.cu similarity index 99% rename from cpp/bench/cluster/kmeans.cu rename to cpp/bench/prims/cluster/kmeans.cu index f593ec090d..af7afb8037 100644 --- a/cpp/bench/cluster/kmeans.cu +++ b/cpp/bench/prims/cluster/kmeans.cu @@ -18,7 +18,7 @@ #include #include -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/bench/cluster/kmeans_balanced.cu b/cpp/bench/prims/cluster/kmeans_balanced.cu similarity index 99% rename from cpp/bench/cluster/kmeans_balanced.cu rename to cpp/bench/prims/cluster/kmeans_balanced.cu index 8dda155a59..6bda43bdb2 100644 --- a/cpp/bench/cluster/kmeans_balanced.cu +++ b/cpp/bench/prims/cluster/kmeans_balanced.cu @@ -18,7 +18,7 @@ #include #include -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/bench/common/benchmark.hpp b/cpp/bench/prims/common/benchmark.hpp similarity index 100% rename from cpp/bench/common/benchmark.hpp rename to cpp/bench/prims/common/benchmark.hpp diff --git a/cpp/bench/distance/distance_common.cuh b/cpp/bench/prims/distance/distance_common.cuh similarity index 99% rename from cpp/bench/distance/distance_common.cuh rename to cpp/bench/prims/distance/distance_common.cuh index 906271bf5a..9b5d67a46f 100644 --- a/cpp/bench/distance/distance_common.cuh +++ b/cpp/bench/prims/distance/distance_common.cuh @@ -17,7 +17,7 @@ #include #include #include -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif #include diff --git a/cpp/bench/distance/distance_cosine.cu b/cpp/bench/prims/distance/distance_cosine.cu similarity index 94% rename from cpp/bench/distance/distance_cosine.cu rename to 
cpp/bench/prims/distance/distance_cosine.cu index 20f29ce4ef..c8ac8067c8 100644 --- a/cpp/bench/distance/distance_cosine.cu +++ b/cpp/bench/prims/distance/distance_cosine.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/distance/distance_exp_l2.cu b/cpp/bench/prims/distance/distance_exp_l2.cu similarity index 94% rename from cpp/bench/distance/distance_exp_l2.cu rename to cpp/bench/prims/distance/distance_exp_l2.cu index 5a3af17193..52b7fff05c 100644 --- a/cpp/bench/distance/distance_exp_l2.cu +++ b/cpp/bench/prims/distance/distance_exp_l2.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/distance/distance_l1.cu b/cpp/bench/prims/distance/distance_l1.cu similarity index 93% rename from cpp/bench/distance/distance_l1.cu rename to cpp/bench/prims/distance/distance_l1.cu index 2ad7d5e957..e80db48ef0 100644 --- a/cpp/bench/distance/distance_l1.cu +++ b/cpp/bench/prims/distance/distance_l1.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/distance/distance_unexp_l2.cu b/cpp/bench/prims/distance/distance_unexp_l2.cu similarity index 94% rename from cpp/bench/distance/distance_unexp_l2.cu rename to cpp/bench/prims/distance/distance_unexp_l2.cu index 406aca2378..7ac1a8a4b5 100644 --- a/cpp/bench/distance/distance_unexp_l2.cu +++ b/cpp/bench/prims/distance/distance_unexp_l2.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/distance/fused_l2_nn.cu b/cpp/bench/prims/distance/fused_l2_nn.cu similarity index 99% rename from cpp/bench/distance/fused_l2_nn.cu rename to cpp/bench/prims/distance/fused_l2_nn.cu index 7531784707..1c45572782 100644 --- a/cpp/bench/distance/fused_l2_nn.cu +++ b/cpp/bench/prims/distance/fused_l2_nn.cu @@ -17,7 +17,7 @@ #include #include #include -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif #include diff --git a/cpp/bench/distance/kernels.cu b/cpp/bench/prims/distance/kernels.cu similarity index 99% rename from cpp/bench/distance/kernels.cu rename to cpp/bench/prims/distance/kernels.cu index 027f93171e..4407bdcf83 100644 --- a/cpp/bench/distance/kernels.cu +++ b/cpp/bench/prims/distance/kernels.cu @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/bench/distance/masked_nn.cu b/cpp/bench/prims/distance/masked_nn.cu similarity index 99% rename from cpp/bench/distance/masked_nn.cu rename to cpp/bench/prims/distance/masked_nn.cu index 1fecb455c3..f9f234187d 100644 --- a/cpp/bench/distance/masked_nn.cu +++ b/cpp/bench/prims/distance/masked_nn.cu @@ -30,7 +30,7 @@ #include #include -#ifdef RAFT_DISTANCE_COMPILED +#ifdef RAFT_COMPILED #include #endif diff --git a/cpp/bench/prims/distance/tune_pairwise/bench.cu b/cpp/bench/prims/distance/tune_pairwise/bench.cu new file mode 100644 index 0000000000..87159ab1b1 --- /dev/null +++ b/cpp/bench/prims/distance/tune_pairwise/bench.cu @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Tuning benchmarks. +// +// Goals: +// +// 1. Fast compile times to maintain iteration speed. +// 2. Create benchmarks that can inform the design of the kernels. +// +// Non-goals: +// +// 1. Measure every distance operation. Instead measures just one distance +// operation at the same time. +// 2. Be useful for finding performance regressions. This is handled by the +// normal benchmarks. +// +// So far, both goals are partly achieved. +// +// RE (1), COMPILE TIMES: kernel.cu is fast to compile. This file is not. +// When the internals of a pairwise distance kernel is changed, this file is not +// recompiled. 
+// +// RE 2, benchmarks with intent: this file contains a benchmark to check the +// maximal throughput of a kernel. Measuring other things, like performance on +// skinny or wide matrices is not yet implemented. + +#include "kernel.cuh" // launch_kernel +#include // std::min +#include // RAFT_BENCH_REGISTER +#include // pairwise_matrix_params +#include // rmm::device_uvector +#include // std::vector + +namespace raft::bench::distance::tune { + +// Max throughput benchmark. +// +// Goal: Measure the maximum distances/sec that can be computed. +// +// To achieve this, we make sure that: +// +// - Input data size is a multiple of the block tile size. +// +// - Perfect distribution of work between SMs, i.e. the number of block tiles is +// a large multiple (num_waves) of the number of blocks (#SMs * occupancy). +// +// - Multiple iterations over Kblk are executed (num_k_iters). +struct throughput_param { + int num_waves; + int occupancy; + int num_k_iters; +}; + +const std::vector throughput_params{ + // 32 waves, requested occupancy of 4, and 32 k iterations typically achieves + // maximum throughput. No need to pick higher values. + {32, 4, 32}, +}; + +struct throughput_bench : public fixture { + const throughput_param p; + + throughput_bench(const throughput_param& p_) : p(p_) {} + + void run_benchmark(::benchmark::State& state) override + { + // Get block size: + int block_m, block_n, block_k; + get_block_size(block_m, block_n, block_k); + + // Determine number of blocks that will be launched. This informs the size + // of the inputs as well as the grid size. + const int num_sms = raft::getMultiProcessorCount(); + const int max_occupancy = get_max_occupancy(); + const int occupancy = std::min(p.occupancy, max_occupancy); + const int num_blocks = occupancy * num_sms; + dim3 grid(num_blocks); + + // Create input sizes that are a multiple of the block tile size. 
+ size_t m = block_m; + size_t n = block_n * p.num_waves * num_blocks; + size_t k = block_k * p.num_k_iters; + + // DataT, OutT, IdxT, etc, are defined in tuned_kernel.cuh + rmm::device_uvector x_vec(m * k, stream); + rmm::device_uvector y_vec(n * k, stream); + rmm::device_uvector x_norm_vec(m, stream); + rmm::device_uvector y_norm_vec(n, stream); + rmm::device_uvector out_vec(m * n, stream); + + auto x = x_vec.data(); + auto y = y_vec.data(); + auto x_norm = x_norm_vec.data(); + auto y_norm = y_norm_vec.data(); + auto out = out_vec.data(); + FinOpT fin_op{}; + + // Create kernel parameter struct. Flip x and y if column major. + IdxT ldx = row_major ? k : m; + IdxT ldy = row_major ? k : n; + IdxT ld_out = row_major ? n : m; + + // Template parameters of pairwise_matrix_params are defined in kernel.cuh + pairwise_matrix_params kparams{ + IdxT(m), IdxT(n), IdxT(k), ldx, ldy, ld_out, x, y, x_norm, y_norm, out, fin_op, row_major}; + + // Run benchmark + loop_on_state(state, [&]() { launch_kernel(kparams, grid, stream); }); + + // Report metrics. We don't report flop/s because we do not know for each + // distance operation how many flops it costs. For L2_unexp and l1, we can + // double this number to get the flop/s. For l2 expanded, core_ops/s should + // equal flop/s (modulo the sqrt and subtracting from the norm). 
+ size_t num_core_ops = m * n * k; + size_t read_elts = n * k + m * k; + size_t write_elts = m * n; + + state.counters["m"] = benchmark::Counter(m); + state.counters["n"] = benchmark::Counter(n); + state.counters["k"] = benchmark::Counter(k); + state.counters["occupancy"] = benchmark::Counter(occupancy); + state.counters["# waves"] = benchmark::Counter(p.num_waves); + state.counters["# k iters"] = benchmark::Counter(p.num_k_iters); + + state.counters["core_ops/s"] = benchmark::Counter(num_core_ops, + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + + state.counters["BW"] = benchmark::Counter(write_elts * sizeof(OutT) + read_elts * sizeof(DataT), + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + } +}; + +RAFT_BENCH_REGISTER(throughput_bench, "", throughput_params); + +} // namespace raft::bench::distance::tune diff --git a/cpp/bench/prims/distance/tune_pairwise/kernel.cu b/cpp/bench/prims/distance/tune_pairwise/kernel.cu new file mode 100644 index 0000000000..3112e1ea9a --- /dev/null +++ b/cpp/bench/prims/distance/tune_pairwise/kernel.cu @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel.cuh" +#include // pairwise_matrix_sm60_wrapper +#include // raft::linalg::Policy4x4 +#include // raft::util::arch::SM_compute_arch + +namespace raft::bench::distance::tune { + +// Distance op +using OpT = raft::distance::detail::ops::lp_unexp_distance_op; +constexpr float metric_arg = 2.0; +OpT distance_op{metric_arg}; + +// Kernel policy +constexpr int vec_len = 1; +using Policy = typename raft::linalg::Policy4x4::Policy; + +// Architecture +namespace arch = raft::util::arch; +constexpr auto sm_compat_range = arch::SM_range(arch::SM_min(), arch::SM_future()); + +void launch_kernel(pairwise_matrix_params params, dim3 grid, cudaStream_t stream) +{ + dim3 block(Policy::Nthreads); + int smem_size = OpT::shared_mem_size(); + + // Obtain function pointer to kernel + auto kernel = raft::distance::detail::pairwise_matrix_kernel; + + kernel<<>>(distance_op, params); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +void get_block_size(int& m, int& n, int& k) +{ + m = Policy::Mblk; + n = Policy::Nblk; + k = Policy::Kblk; +} + +void* get_kernel_ptr() +{ + auto kernel = raft::distance::detail::pairwise_matrix_kernel; + return reinterpret_cast(kernel); +} + +int get_max_occupancy() +{ + void* kernel_ptr = get_kernel_ptr(); + int max_occupancy; + int smem_size = OpT::shared_mem_size(); + + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_occupancy, kernel_ptr, Policy::Nthreads, smem_size)); + + return max_occupancy; +} + +} // namespace raft::bench::distance::tune diff --git a/cpp/bench/prims/distance/tune_pairwise/kernel.cuh b/cpp/bench/prims/distance/tune_pairwise/kernel.cuh new file mode 100644 index 0000000000..5da54a343c --- /dev/null +++ b/cpp/bench/prims/distance/tune_pairwise/kernel.cuh @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include // lp_unexp_distance_op +#include // pairwise_matrix_params + +namespace raft::bench::distance::tune { + +// Launch one specific kernel with the following template parameters +constexpr bool row_major = true; +using DataT = float; +using AccT = float; +using OutT = DataT; +using IdxT = int; + +using FinOpT = raft::identity_op; + +using pairwise_matrix_params = + raft::distance::detail::pairwise_matrix_params; + +// Launches kernel +void launch_kernel(pairwise_matrix_params, dim3, cudaStream_t); + +// Describes the block size that is decided by the policy +void get_block_size(int& m, int& n, int& k); + +int get_max_occupancy(); + +} // namespace raft::bench::distance::tune diff --git a/cpp/bench/linalg/add.cu b/cpp/bench/prims/linalg/add.cu similarity index 96% rename from cpp/bench/linalg/add.cu rename to cpp/bench/prims/linalg/add.cu index 7d00b8cbae..456214ad7b 100644 --- a/cpp/bench/linalg/add.cu +++ b/cpp/bench/prims/linalg/add.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/linalg/map_then_reduce.cu b/cpp/bench/prims/linalg/map_then_reduce.cu similarity index 97% rename from cpp/bench/linalg/map_then_reduce.cu rename to cpp/bench/prims/linalg/map_then_reduce.cu index 33a3e66264..84aebd85bf 100644 --- a/cpp/bench/linalg/map_then_reduce.cu +++ b/cpp/bench/prims/linalg/map_then_reduce.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/matrix_vector_op.cu b/cpp/bench/prims/linalg/matrix_vector_op.cu similarity index 99% rename from cpp/bench/linalg/matrix_vector_op.cu rename to cpp/bench/prims/linalg/matrix_vector_op.cu index aa388955da..d1fbaee79b 100644 --- a/cpp/bench/linalg/matrix_vector_op.cu +++ b/cpp/bench/prims/linalg/matrix_vector_op.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/norm.cu b/cpp/bench/prims/linalg/norm.cu similarity index 98% rename from cpp/bench/linalg/norm.cu rename to cpp/bench/prims/linalg/norm.cu index efecee88c9..f83953f8e4 100644 --- a/cpp/bench/linalg/norm.cu +++ b/cpp/bench/prims/linalg/norm.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/linalg/normalize.cu b/cpp/bench/prims/linalg/normalize.cu similarity index 98% rename from cpp/bench/linalg/normalize.cu rename to cpp/bench/prims/linalg/normalize.cu index d01473ffeb..ad9052a008 100644 --- a/cpp/bench/linalg/normalize.cu +++ b/cpp/bench/prims/linalg/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/reduce.cu b/cpp/bench/prims/linalg/reduce.cu similarity index 97% rename from cpp/bench/linalg/reduce.cu rename to cpp/bench/prims/linalg/reduce.cu index 015e0b8abe..cf41c5916a 100644 --- a/cpp/bench/linalg/reduce.cu +++ b/cpp/bench/prims/linalg/reduce.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/reduce_cols_by_key.cu b/cpp/bench/prims/linalg/reduce_cols_by_key.cu similarity index 98% rename from cpp/bench/linalg/reduce_cols_by_key.cu rename to cpp/bench/prims/linalg/reduce_cols_by_key.cu index 43aeb69ab0..ac0c612ee4 100644 --- a/cpp/bench/linalg/reduce_cols_by_key.cu +++ b/cpp/bench/prims/linalg/reduce_cols_by_key.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/linalg/reduce_rows_by_key.cu b/cpp/bench/prims/linalg/reduce_rows_by_key.cu similarity index 98% rename from cpp/bench/linalg/reduce_rows_by_key.cu rename to cpp/bench/prims/linalg/reduce_rows_by_key.cu index 075bc7c8c4..aa9c9a1f62 100644 --- a/cpp/bench/linalg/reduce_rows_by_key.cu +++ b/cpp/bench/prims/linalg/reduce_rows_by_key.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/main.cpp b/cpp/bench/prims/main.cpp similarity index 92% rename from cpp/bench/main.cpp rename to cpp/bench/prims/main.cpp index 3162422e8e..40f539facf 100644 --- a/cpp/bench/main.cpp +++ b/cpp/bench/prims/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/matrix/argmin.cu b/cpp/bench/prims/matrix/argmin.cu similarity index 100% rename from cpp/bench/matrix/argmin.cu rename to cpp/bench/prims/matrix/argmin.cu diff --git a/cpp/bench/matrix/gather.cu b/cpp/bench/prims/matrix/gather.cu similarity index 100% rename from cpp/bench/matrix/gather.cu rename to cpp/bench/prims/matrix/gather.cu diff --git a/cpp/bench/prims/matrix/select_k.cu b/cpp/bench/prims/matrix/select_k.cu new file mode 100644 index 0000000000..870119db52 --- /dev/null +++ b/cpp/bench/prims/matrix/select_k.cu @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include + +#if defined RAFT_COMPILED +#include +#endif + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace raft::matrix { + +using namespace raft::bench; // NOLINT + +template +struct selection : public fixture { + explicit selection(const select::params& p) + : params_(p), + in_dists_(p.batch_size * p.len, stream), + in_ids_(p.batch_size * p.len, stream), + out_dists_(p.batch_size * p.k, stream), + out_ids_(p.batch_size * p.k, stream) + { + raft::sparse::iota_fill(in_ids_.data(), IdxT(p.batch_size), IdxT(p.len), stream); + raft::random::RngState state{42}; + + KeyT min_value = -1.0; + KeyT max_value = 1.0; + if (p.use_same_leading_bits) { + if constexpr (std::is_same_v) { + uint32_t min_bits = 0x3F800000; // 1.0 + uint32_t max_bits = 0x3F8000FF; // 1.00003 + memcpy(&min_value, &min_bits, sizeof(KeyT)); + memcpy(&max_value, &max_bits, sizeof(KeyT)); + } else if constexpr (std::is_same_v) { + uint64_t min_bits = 0x3FF0000000000000; // 1.0 + uint64_t max_bits = 0x3FF0000FFFFFFFFF; // 1.000015 + memcpy(&min_value, &min_bits, sizeof(KeyT)); + memcpy(&max_value, &max_bits, sizeof(KeyT)); + } + } + raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), min_value, max_value); + } + + void run_benchmark(::benchmark::State& state) override // NOLINT + { + device_resources handle{stream}; + using_pool_memory_res res; + try { + std::ostringstream label_stream; + label_stream << params_.batch_size << "#" << params_.len << 
"#" << params_.k; + if (params_.use_same_leading_bits) { label_stream << "#same-leading-bits"; } + state.SetLabel(label_stream.str()); + loop_on_state(state, [this, &handle]() { + select::select_k_impl(handle, + Algo, + in_dists_.data(), + in_ids_.data(), + params_.batch_size, + params_.len, + params_.k, + out_dists_.data(), + out_ids_.data(), + params_.select_min); + }); + } catch (raft::exception& e) { + state.SkipWithError(e.what()); + } + } + + private: + const select::params params_; + rmm::device_uvector in_dists_, out_dists_; + rmm::device_uvector in_ids_, out_ids_; +}; + +const std::vector kInputs{ + {20000, 500, 1, true}, + {20000, 500, 2, true}, + {20000, 500, 4, true}, + {20000, 500, 8, true}, + {20000, 500, 16, true}, + {20000, 500, 32, true}, + {20000, 500, 64, true}, + {20000, 500, 128, true}, + {20000, 500, 256, true}, + + {1000, 10000, 1, true}, + {1000, 10000, 2, true}, + {1000, 10000, 4, true}, + {1000, 10000, 8, true}, + {1000, 10000, 16, true}, + {1000, 10000, 32, true}, + {1000, 10000, 64, true}, + {1000, 10000, 128, true}, + {1000, 10000, 256, true}, + + {100, 100000, 1, true}, + {100, 100000, 2, true}, + {100, 100000, 4, true}, + {100, 100000, 8, true}, + {100, 100000, 16, true}, + {100, 100000, 32, true}, + {100, 100000, 64, true}, + {100, 100000, 128, true}, + {100, 100000, 256, true}, + + {10, 1000000, 1, true}, + {10, 1000000, 2, true}, + {10, 1000000, 4, true}, + {10, 1000000, 8, true}, + {10, 1000000, 16, true}, + {10, 1000000, 32, true}, + {10, 1000000, 64, true}, + {10, 1000000, 128, true}, + {10, 1000000, 256, true}, + + {10, 1000000, 1, true, false, true}, + {10, 1000000, 2, true, false, true}, + {10, 1000000, 4, true, false, true}, + {10, 1000000, 8, true, false, true}, + {10, 1000000, 16, true, false, true}, + {10, 1000000, 32, true, false, true}, + {10, 1000000, 64, true, false, true}, + {10, 1000000, 128, true, false, true}, + {10, 1000000, 256, true, false, true}, +}; + +#define SELECTION_REGISTER(KeyT, IdxT, A) \ + namespace 
BENCHMARK_PRIVATE_NAME(selection) \ + { \ + using SelectK = selection; \ + RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #A, kInputs); \ + } + +SELECTION_REGISTER(float, uint32_t, kPublicApi); // NOLINT +SELECTION_REGISTER(float, uint32_t, kRadix8bits); // NOLINT +SELECTION_REGISTER(float, uint32_t, kRadix11bits); // NOLINT +SELECTION_REGISTER(float, uint32_t, kRadix11bitsExtraPass); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpAuto); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpImmediate); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpFiltered); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpDistributed); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpDistributedShm); // NOLINT + +SELECTION_REGISTER(double, uint32_t, kRadix8bits); // NOLINT +SELECTION_REGISTER(double, uint32_t, kRadix11bits); // NOLINT +SELECTION_REGISTER(double, uint32_t, kRadix11bitsExtraPass); // NOLINT +SELECTION_REGISTER(double, uint32_t, kWarpAuto); // NOLINT + +SELECTION_REGISTER(double, int64_t, kRadix8bits); // NOLINT +SELECTION_REGISTER(double, int64_t, kRadix11bits); // NOLINT +SELECTION_REGISTER(double, int64_t, kRadix11bitsExtraPass); // NOLINT +SELECTION_REGISTER(double, int64_t, kWarpImmediate); // NOLINT +SELECTION_REGISTER(double, int64_t, kWarpFiltered); // NOLINT +SELECTION_REGISTER(double, int64_t, kWarpDistributed); // NOLINT +SELECTION_REGISTER(double, int64_t, kWarpDistributedShm); // NOLINT + +} // namespace raft::matrix diff --git a/cpp/bench/neighbors/knn.cuh b/cpp/bench/prims/neighbors/knn.cuh similarity index 98% rename from cpp/bench/neighbors/knn.cuh rename to cpp/bench/prims/neighbors/knn.cuh index fe8c2c10d8..6caf355034 100644 --- a/cpp/bench/neighbors/knn.cuh +++ b/cpp/bench/prims/neighbors/knn.cuh @@ -24,15 +24,10 @@ #include #include -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif -#if defined RAFT_NN_COMPILED -// TODO: Legacy. 
Remove when FAISS is removed -#include -#endif - #include #include diff --git a/cpp/bench/neighbors/knn/brute_force_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/brute_force_float_int64_t.cu similarity index 93% rename from cpp/bench/neighbors/knn/brute_force_float_int64_t.cu rename to cpp/bench/prims/neighbors/knn/brute_force_float_int64_t.cu index d981104e20..7df0599670 100644 --- a/cpp/bench/neighbors/knn/brute_force_float_int64_t.cu +++ b/cpp/bench/prims/neighbors/knn/brute_force_float_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/neighbors/knn/brute_force_float_uint32_t.cu b/cpp/bench/prims/neighbors/knn/brute_force_float_uint32_t.cu similarity index 93% rename from cpp/bench/neighbors/knn/brute_force_float_uint32_t.cu rename to cpp/bench/prims/neighbors/knn/brute_force_float_uint32_t.cu index 60f7edae96..9704d39e76 100644 --- a/cpp/bench/neighbors/knn/brute_force_float_uint32_t.cu +++ b/cpp/bench/prims/neighbors/knn/brute_force_float_uint32_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/neighbors/knn/ivf_flat_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu similarity index 93% rename from cpp/bench/neighbors/knn/ivf_flat_float_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu index 594d4d16d2..fbbb4f9acc 100644 --- a/cpp/bench/neighbors/knn/ivf_flat_float_int64_t.cu +++ b/cpp/bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu similarity index 93% rename from cpp/bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu index bd268f036c..7067dbe1b6 100644 --- a/cpp/bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu +++ b/cpp/bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/neighbors/knn/ivf_flat_uint8_t_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu similarity index 100% rename from cpp/bench/neighbors/knn/ivf_flat_uint8_t_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu diff --git a/cpp/bench/neighbors/knn/ivf_pq_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu similarity index 100% rename from cpp/bench/neighbors/knn/ivf_pq_float_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu diff --git a/cpp/bench/neighbors/knn/ivf_pq_int8_t_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu similarity index 100% rename from cpp/bench/neighbors/knn/ivf_pq_int8_t_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu diff --git a/cpp/bench/neighbors/knn/ivf_pq_uint8_t_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu similarity index 100% rename from cpp/bench/neighbors/knn/ivf_pq_uint8_t_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu diff --git a/cpp/bench/neighbors/refine.cuh b/cpp/bench/prims/neighbors/refine.cuh similarity index 100% 
rename from cpp/bench/neighbors/refine.cuh rename to cpp/bench/prims/neighbors/refine.cuh diff --git a/cpp/bench/neighbors/refine_float_int64_t.cu b/cpp/bench/prims/neighbors/refine_float_int64_t.cu similarity index 93% rename from cpp/bench/neighbors/refine_float_int64_t.cu rename to cpp/bench/prims/neighbors/refine_float_int64_t.cu index 40ab2bc0ca..43be330e9b 100644 --- a/cpp/bench/neighbors/refine_float_int64_t.cu +++ b/cpp/bench/prims/neighbors/refine_float_int64_t.cu @@ -17,11 +17,8 @@ #include "refine.cuh" #include -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include -#endif - -#if defined RAFT_NN_COMPILED #include #endif diff --git a/cpp/bench/neighbors/refine_uint8_t_int64_t.cu b/cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu similarity index 80% rename from cpp/bench/neighbors/refine_uint8_t_int64_t.cu rename to cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu index 92806f84a7..1d7cb8c8aa 100644 --- a/cpp/bench/neighbors/refine_uint8_t_int64_t.cu +++ b/cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu @@ -17,13 +17,8 @@ #include "refine.cuh" #include -#if defined RAFT_DISTANCE_COMPILED -#include -#include -#endif - -#if defined RAFT_NN_COMPILED -#include +#if defined RAFT_COMPILED +#include #endif using namespace raft::neighbors; diff --git a/cpp/bench/random/make_blobs.cu b/cpp/bench/prims/random/make_blobs.cu similarity index 98% rename from cpp/bench/random/make_blobs.cu rename to cpp/bench/prims/random/make_blobs.cu index 950d80c499..f43d914cf2 100644 --- a/cpp/bench/random/make_blobs.cu +++ b/cpp/bench/prims/random/make_blobs.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/random/permute.cu b/cpp/bench/prims/random/permute.cu similarity index 100% rename from cpp/bench/random/permute.cu rename to cpp/bench/prims/random/permute.cu diff --git a/cpp/bench/random/rng.cu b/cpp/bench/prims/random/rng.cu similarity index 98% rename from cpp/bench/random/rng.cu rename to cpp/bench/prims/random/rng.cu index 147adf26ae..d15c9441d7 100644 --- a/cpp/bench/random/rng.cu +++ b/cpp/bench/prims/random/rng.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/sparse/convert_csr.cu b/cpp/bench/prims/sparse/convert_csr.cu similarity index 100% rename from cpp/bench/sparse/convert_csr.cu rename to cpp/bench/prims/sparse/convert_csr.cu diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake index 5e68ca5bc4..c733d46985 100644 --- a/cpp/cmake/modules/ConfigureCUDA.cmake +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -21,6 +21,10 @@ if(CMAKE_COMPILER_IS_GNUCXX) list(APPEND RAFT_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) endif() +if(CUDA_LOG_COMPILE_TIME) + list(APPEND RAFT_CUDA_FLAGS "--time=nvcc_compile_log.csv") +endif() + list(APPEND RAFT_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) list(APPEND RAFT_CXX_FLAGS "-DCUDA_API_PER_THREAD_DEFAULT_STREAM") list(APPEND RAFT_CUDA_FLAGS "-DCUDA_API_PER_THREAD_DEFAULT_STREAM") diff --git a/cpp/cmake/modules/FindAVX.cmake b/cpp/cmake/modules/FindAVX.cmake new file mode 100644 index 0000000000..7f3b2dfc76 --- /dev/null +++ b/cpp/cmake/modules/FindAVX.cmake @@ -0,0 +1,110 @@ +# ============================================================================= +# Copyright (c) 2016- Facebook, Inc (Adam Paszke) +# Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +# Copyright (c) 2011-2013 NYU (Clement Farabet) +# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +# Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) +# +# Note: This file was copied from PyTorch and modified for use in the RAFT library. +# Refer to thirdparty/LICENSES/LICENSE.pytorch for license and additional +# copyright information. 
+# ============================================================================= + +INCLUDE(CheckCXXSourceRuns) + +SET(AVX_CODE + " + #include + + int main() + { + __m256 a; + a = _mm256_set1_ps(0); + return 0; + } +" +) + +SET(AVX512_CODE + " + #include + + int main() + { + __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0); + __m512i b = a; + __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ); + return 0; + } +" +) + +SET(AVX2_CODE + " + #include + + int main() + { + __m256i a = {0}; + a = _mm256_abs_epi16(a); + __m256i x; + _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code + return 0; + } +" +) + +MACRO(CHECK_SSE lang type flags) + SET(__FLAG_I 1) + SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + FOREACH(__FLAG ${flags}) + IF(NOT ${lang}_${type}_FOUND) + SET(CMAKE_REQUIRED_FLAGS ${__FLAG}) + CHECK_CXX_SOURCE_RUNS("${${type}_CODE}" ${lang}_HAS_${type}_${__FLAG_I}) + IF(${lang}_HAS_${type}_${__FLAG_I}) + SET(${lang}_${type}_FOUND + TRUE + CACHE BOOL "${lang} ${type} support" + ) + SET(${lang}_${type}_FLAGS + "${__FLAG}" + CACHE STRING "${lang} ${type} flags" + ) + ENDIF() + MATH(EXPR __FLAG_I "${__FLAG_I}+1") + ENDIF() + ENDFOREACH() + SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + + IF(NOT ${lang}_${type}_FOUND) + SET(${lang}_${type}_FOUND + FALSE + CACHE BOOL "${lang} ${type} support" + ) + SET(${lang}_${type}_FLAGS + "" + CACHE STRING "${lang} ${type} flags" + ) + ENDIF() + + MARK_AS_ADVANCED(${lang}_${type}_FOUND ${lang}_${type}_FLAGS) + +ENDMACRO() + +# CHECK_SSE(C "AVX" " ;-mavx;/arch:AVX") CHECK_SSE(C "AVX2" " ;-mavx2 -mfma;/arch:AVX2") CHECK_SSE(C +# "AVX512" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma;/arch:AVX512") +# +CHECK_SSE(CXX "AVX" " ;-mavx;/arch:AVX") +CHECK_SSE(CXX "AVX2" " ;-mavx2 -mfma;/arch:AVX2") 
+CHECK_SSE(CXX "AVX512" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma;/arch:AVX512") diff --git a/cpp/cmake/modules/raft_export.cmake b/cpp/cmake/modules/raft_export.cmake index bcc3578bf8..0a43f9451c 100644 --- a/cpp/cmake/modules/raft_export.cmake +++ b/cpp/cmake/modules/raft_export.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -213,7 +213,7 @@ function(raft_export type project_name) DESTINATION "${install_location}" COMPONENT raft ) - foreach(comp nn distance) + foreach(comp compiled) set(scratch_dir "${PROJECT_BINARY_DIR}/rapids-cmake/${project_name}/export/${comp}/") file(MAKE_DIRECTORY "${scratch_dir}") install( diff --git a/cpp/cmake/patches/ggnn.patch b/cpp/cmake/patches/ggnn.patch new file mode 100644 index 0000000000..95e1aaff4b --- /dev/null +++ b/cpp/cmake/patches/ggnn.patch @@ -0,0 +1,206 @@ +diff --git a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh b/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh +index 8cbaf0d..6eb72ac 100644 +--- a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh ++++ b/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh +@@ -41,7 +41,6 @@ limitations under the License. 
+ #include "ggnn/sym/cuda_knn_sym_query_layer.cuh" + #include "ggnn/utils/cuda_knn_utils.cuh" + #include "ggnn/utils/cuda_knn_constants.cuh" +-#include "ggnn/utils/cuda_knn_dataset.cuh" + + template + __global__ void divide(ValueT* res, ValueT* input, ValueT N) { +@@ -98,9 +97,7 @@ struct GGNNGPUInstance { + typedef GGNNGraphDevice GGNNGraphDevice; + typedef GGNNGraphHost GGNNGraphHost; + +- const Dataset* dataset; + GGNNGraphBuffer* ggnn_buffer {nullptr}; +- GGNNQuery ggnn_query; + + // Graph Shards resident on the GPU + std::vector ggnn_shards; +@@ -117,13 +114,12 @@ struct GGNNGPUInstance { + // number of shards that need to be processed by this instance + const int num_parts; + +- GGNNGPUInstance(const int gpu_id, const Dataset* dataset, ++ GGNNGPUInstance(const int gpu_id, + const int N_shard, const int L, + const bool enable_construction, const float tau_build, + const int num_parts=1, const int num_cpu_buffers=1) : + N_shard{N_shard}, L{L}, tau_build{tau_build}, +- dataset{dataset}, gpu_id{gpu_id}, +- ggnn_query{dataset->N_query, D, KQuery, num_parts}, ++ gpu_id{gpu_id}, + num_parts{num_parts} + { + CHECK_LE(L, MAX_LAYER); +@@ -135,7 +131,6 @@ struct GGNNGPUInstance { + CHECK_EQ(current_gpu_id, gpu_id) << "cudaSetDevice() needs to be called in advance!"; + } + +- ggnn_query.loadQueriesAsync(dataset->h_query, 0); + + computeGraphParameters(); + +@@ -186,7 +181,7 @@ struct GGNNGPUInstance { + } + + GGNNGPUInstance(const GGNNGPUInstance& other) +- : dataset{nullptr}, ggnn_query{0, D, KQuery}, ++ : + gpu_id{0}, N_shard{0}, num_parts{0} { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. 
+@@ -305,6 +300,7 @@ struct GGNNGPUInstance { + + // io + ++ /* + void waitForDiskIO(const int shard_id) { + auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; + if (cpu_buffer.disk_io_thread.joinable()) +@@ -468,11 +464,12 @@ struct GGNNGPUInstance { + CHECK_CUDA(cudaDeviceSynchronize()); + CHECK_CUDA(cudaPeekAtLastError()); + } ++ */ + + // graph operations + + template +- void queryLayer(const int shard_id = 0) const { ++ void queryLayer(const BaseT* d_query, int batch_size, KeyT* d_query_result_ids, ValueT* d_query_result_dists, const int shard_id = 0) const { + CHECK_CUDA(cudaSetDevice(gpu_id)); + const auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); + +@@ -482,21 +479,21 @@ struct GGNNGPUInstance { + + int* m_dist_statistics = nullptr; + if (DIST_STATS) +- cudaMallocManaged(&m_dist_statistics, dataset->N_query * sizeof(int)); ++ cudaMallocManaged(&m_dist_statistics, batch_size * sizeof(int)); + + QueryKernel query_kernel; + query_kernel.d_base = shard.d_base; +- query_kernel.d_query = ggnn_query.d_query; ++ query_kernel.d_query = d_query; + + query_kernel.d_graph = shard.d_graph; +- query_kernel.d_query_results = ggnn_query.d_query_result_ids; +- query_kernel.d_query_results_dists = ggnn_query.d_query_result_dists; ++ query_kernel.d_query_results = d_query_result_ids; ++ query_kernel.d_query_results_dists = d_query_result_dists; + + query_kernel.d_translation = shard.d_translation; + + query_kernel.d_nn1_stats = shard.d_nn1_stats; + +- query_kernel.N = dataset->N_query; ++ query_kernel.N = batch_size; + query_kernel.N_offset = 0; + + query_kernel.d_dist_stats = m_dist_statistics; +@@ -771,6 +768,16 @@ struct GGNNGPUInstance { + sym(layer, shard_id); + } + } ++ ++ void set_stream(cudaStream_t stream) { ++ assert(ggnn_shards.size() == 1); ++ ggnn_shards.at(0).stream = stream; ++ } ++ ++ void set_base_data(const BaseT* dataset) { ++ assert(ggnn_shards.size() == 1); ++ ggnn_shards.at(0).d_base = dataset; ++ } + }; + + #endif // 
INCLUDE_GGNN_CUDA_KNN_GGNN_GPU_INSTANCE_CUH_ +diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh +index c94a8f1..781226d 100644 +--- a/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh ++++ b/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh +@@ -50,7 +50,7 @@ struct GGNNGraphDevice { + ValueT* d_nn1_stats; + + /// base data pointer for the shard. +- BaseT* d_base; ++ const BaseT* d_base; + + /// combined memory pool + char* d_memory; +@@ -69,7 +69,9 @@ struct GGNNGraphDevice { + const size_t selection_translation_size = align8(ST_all * sizeof(KeyT)); + const size_t nn1_stats_size = align8(2 * sizeof(ValueT)); + total_graph_size = graph_size + 2 * selection_translation_size + nn1_stats_size; +- base_size = align8(static_cast(N) * D * sizeof(BaseT)); ++ // base_size = align8(static_cast(N) * D * sizeof(BaseT)); ++ (void) N; ++ (void) D; + + const size_t total_size = base_size+total_graph_size; + +@@ -86,8 +88,7 @@ struct GGNNGraphDevice { + CHECK_CUDA(cudaMalloc(&d_memory, total_size)); + + size_t pos = 0; +- d_base = reinterpret_cast(d_memory+pos); +- pos += base_size; ++ d_base = nullptr; + d_graph = reinterpret_cast(d_memory+pos); + pos += graph_size; + d_translation = reinterpret_cast(d_memory+pos); +@@ -99,14 +100,14 @@ struct GGNNGraphDevice { + + CHECK_EQ(pos, total_size); + +- CHECK_CUDA(cudaStreamCreate(&stream)); ++ // CHECK_CUDA(cudaStreamCreate(&stream)); + + CHECK_CUDA(cudaPeekAtLastError()); + CHECK_CUDA(cudaDeviceSynchronize()); + CHECK_CUDA(cudaPeekAtLastError()); + } + +- GGNNGraphDevice(const GGNNGraphDevice& other) { ++ GGNNGraphDevice(const GGNNGraphDevice&) { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. + // always make sure that enough memory is reserved ahead of time. 
+@@ -116,7 +117,7 @@ struct GGNNGraphDevice { + ~GGNNGraphDevice() { + cudaFree(d_memory); + +- CHECK_CUDA(cudaStreamDestroy(stream)); ++ // CHECK_CUDA(cudaStreamDestroy(stream)); + } + }; + +diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh +index 2055f9e..ef5843a 100644 +--- a/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh ++++ b/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh +@@ -92,7 +92,7 @@ struct GGNNGraphHost { + CHECK_CUDA(cudaPeekAtLastError()); + } + +- GGNNGraphHost(const GGNNGraphHost& other) { ++ GGNNGraphHost(const GGNNGraphHost&) { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. + // always make sure that enough memory is reserved ahead of time. +diff --git a/include/ggnn/select/cuda_knn_wrs_select_layer.cuh b/include/ggnn/select/cuda_knn_wrs_select_layer.cuh +index 49d76a1..eef69e6 100644 +--- a/include/ggnn/select/cuda_knn_wrs_select_layer.cuh ++++ b/include/ggnn/select/cuda_knn_wrs_select_layer.cuh +@@ -22,7 +22,6 @@ limitations under the License. + #include + #include + +-#include + #include + + #include "ggnn/utils/cuda_knn_constants.cuh" diff --git a/cpp/cmake/patches/nlohmann_json.patch b/cpp/cmake/patches/nlohmann_json.patch new file mode 100644 index 0000000000..83dd56bc16 --- /dev/null +++ b/cpp/cmake/patches/nlohmann_json.patch @@ -0,0 +1,38 @@ +--- nlohmann/json.hpp 2021-05-06 11:40:39.770669693 +0800 ++++ nlohmann/json_patched.hpp 2021-06-02 18:46:43.849334466 +0800 +@@ -16607,6 +16607,21 @@ + } + } + ++ ++ template ::value, int> = 0> ++ bool is_negative_number(NumberType x) ++ { ++ return x < 0; ++ } ++ ++ template < typename NumberType, ++ enable_if_t < std::is_unsigned::value, int > = 0 > ++ bool is_negative_number(NumberType /*unused*/) ++ { ++ return false; ++ } ++ + /*! 
+ @brief dump an integer + +@@ -16649,12 +16664,11 @@ + // use a pointer to fill the buffer + auto buffer_ptr = number_buffer.begin(); // NOLINT(llvm-qualified-auto,readability-qualified-auto,cppcoreguidelines-pro-type-vararg,hicpp-vararg) + +- const bool is_negative = std::is_same::value && !(x >= 0); // see issue #755 + number_unsigned_t abs_value; + + unsigned int n_chars{}; + +- if (is_negative) ++ if (is_negative_number(x)) + { + *buffer_ptr = '-'; + abs_value = remove_sign(static_cast(x)); diff --git a/cpp/cmake/thirdparty/get_cutlass.cmake b/cpp/cmake/thirdparty/get_cutlass.cmake index 3e02ce064e..cb809de445 100644 --- a/cpp/cmake/thirdparty/get_cutlass.cmake +++ b/cpp/cmake/thirdparty/get_cutlass.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -61,32 +61,19 @@ function(find_and_configure_cutlass) # We generate the cutlass-config files when we built cutlass locally, so always do # `find_dependency` rapids_export_package( - BUILD NvidiaCutlass raft-distance-exports GLOBAL_TARGETS nvidia::cutlass::cutlass + BUILD NvidiaCutlass raft-exports GLOBAL_TARGETS nvidia::cutlass::cutlass ) rapids_export_package( - INSTALL NvidiaCutlass raft-distance-exports GLOBAL_TARGETS nvidia::cutlass::cutlass - ) - rapids_export_package( - BUILD NvidiaCutlass raft-nn-exports GLOBAL_TARGETS nvidia::cutlass::cutlass - ) - rapids_export_package( - INSTALL NvidiaCutlass raft-nn-exports GLOBAL_TARGETS nvidia::cutlass::cutlass + INSTALL NvidiaCutlass raft-exports GLOBAL_TARGETS nvidia::cutlass::cutlass ) # Tell cmake where it can find the generated NvidiaCutlass-config.cmake we wrote. 
include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( - INSTALL NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}/../]=] raft-distance-exports - ) - rapids_export_find_package_root( - BUILD NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}]=] raft-distance-exports - ) - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - INSTALL NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}/../]=] raft-nn-exports + INSTALL NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}/../]=] raft-exports ) rapids_export_find_package_root( - BUILD NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}]=] raft-nn-exports + BUILD NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}]=] raft-exports ) endfunction() diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake index e6f06a00a5..b7c132f2f1 100644 --- a/cpp/cmake/thirdparty/get_faiss.cmake +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,73 +17,71 @@ function(find_and_configure_faiss) set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN} ) + "${multiValueArgs}" ${ARGN} ) - if(RAFT_ENABLE_NN_DEPENDENCIES OR RAFT_COMPILE_LIBRARIES) - rapids_find_generate_module(faiss - HEADER_NAMES faiss/IndexFlat.h - LIBRARY_NAMES faiss - ) + rapids_find_generate_module(faiss + HEADER_NAMES faiss/IndexFlat.h + LIBRARY_NAMES faiss + ) - set(BUILD_SHARED_LIBS ON) - if (PKG_BUILD_STATIC_LIBS) - set(BUILD_SHARED_LIBS OFF) - set(CPM_DOWNLOAD_faiss ON) - endif() + set(BUILD_SHARED_LIBS ON) + if (PKG_BUILD_STATIC_LIBS) + set(BUILD_SHARED_LIBS OFF) + set(CPM_DOWNLOAD_faiss ON) + endif() - rapids_cpm_find(faiss ${PKG_VERSION} - GLOBAL_TARGETS faiss::faiss - CPM_ARGS - GIT_REPOSITORY ${PKG_REPOSITORY} - GIT_TAG ${PKG_PINNED_TAG} - EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} - OPTIONS - "FAISS_ENABLE_PYTHON OFF" - "CUDAToolkit_ROOT ${CUDAToolkit_LIBRARY_DIR}" - "FAISS_ENABLE_GPU ON" - "BUILD_TESTING OFF" - "CMAKE_MESSAGE_LOG_LEVEL VERBOSE" - "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}" - ) + rapids_cpm_find(faiss ${PKG_VERSION} + GLOBAL_TARGETS faiss::faiss + CPM_ARGS + GIT_REPOSITORY ${PKG_REPOSITORY} + GIT_TAG ${PKG_PINNED_TAG} + EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} + OPTIONS + "FAISS_ENABLE_PYTHON OFF" + "CUDAToolkit_ROOT ${CUDAToolkit_LIBRARY_DIR}" + "FAISS_ENABLE_GPU ON" + "BUILD_TESTING OFF" + "CMAKE_MESSAGE_LOG_LEVEL VERBOSE" + "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}" + ) - if(TARGET faiss AND NOT TARGET faiss::faiss) - add_library(faiss::faiss ALIAS faiss) - endif() + if(TARGET faiss AND NOT TARGET faiss::faiss) + add_library(faiss::faiss ALIAS faiss) + endif() - if(faiss_ADDED) - rapids_export(BUILD faiss - EXPORT_SET faiss-targets - GLOBAL_TARGETS faiss - NAMESPACE faiss::) - endif() - endif() + if(faiss_ADDED) + rapids_export(BUILD faiss + EXPORT_SET faiss-targets + 
GLOBAL_TARGETS faiss + NAMESPACE faiss::) + endif() # We generate the faiss-config files when we built faiss locally, so always do `find_dependency` - rapids_export_package(BUILD OpenMP raft-nn-lib-exports) # faiss uses openMP but doesn't export a need for it - rapids_export_package(BUILD faiss raft-nn-lib-exports GLOBAL_TARGETS faiss::faiss faiss) - rapids_export_package(INSTALL faiss raft-nn-lib-exports GLOBAL_TARGETS faiss::faiss faiss) + rapids_export_package(BUILD OpenMP raft-ann-bench-exports) # faiss uses openMP but doesn't export a need for it + rapids_export_package(BUILD faiss raft-ann-bench-exports GLOBAL_TARGETS faiss::faiss faiss) + rapids_export_package(INSTALL faiss raft-ann-bench-exports GLOBAL_TARGETS faiss::faiss faiss) # Tell cmake where it can find the generated faiss-config.cmake we wrote. include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=] raft-nn-lib-exports) + rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=] raft-ann-bench-exports) endfunction() if(NOT RAFT_FAISS_GIT_TAG) - # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC - # (https://github.com/facebookresearch/faiss/pull/2446) - set(RAFT_FAISS_GIT_TAG fea/statically-link-ctk-v1.7.0) - # set(RAFT_FAISS_GIT_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(RAFT_FAISS_GIT_TAG fea/statically-link-ctk-v1.7.0) + # set(RAFT_FAISS_GIT_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30) endif() if(NOT RAFT_FAISS_GIT_REPOSITORY) - # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC - # (https://github.com/facebookresearch/faiss/pull/2446) - set(RAFT_FAISS_GIT_REPOSITORY https://github.com/trxcllnt/faiss.git) - # set(RAFT_FAISS_GIT_REPOSITORY 
https://github.com/facebookresearch/faiss.git) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(RAFT_FAISS_GIT_REPOSITORY https://github.com/trxcllnt/faiss.git) + # set(RAFT_FAISS_GIT_REPOSITORY https://github.com/facebookresearch/faiss.git) endif() find_and_configure_faiss(VERSION 1.7.0 - REPOSITORY ${RAFT_FAISS_GIT_REPOSITORY} - PINNED_TAG ${RAFT_FAISS_GIT_TAG} - BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC} - EXCLUDE_FROM_ALL ${RAFT_EXCLUDE_FAISS_FROM_ALL}) + REPOSITORY ${RAFT_FAISS_GIT_REPOSITORY} + PINNED_TAG ${RAFT_FAISS_GIT_TAG} + BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC} + EXCLUDE_FROM_ALL ${RAFT_EXCLUDE_FAISS_FROM_ALL}) \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_ggnn.cmake b/cpp/cmake/thirdparty/get_ggnn.cmake new file mode 100644 index 0000000000..708acb6b8d --- /dev/null +++ b/cpp/cmake/thirdparty/get_ggnn.cmake @@ -0,0 +1,44 @@ +#============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_ggnn) + set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ ) + if (NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/ggnn-src/) + + execute_process ( + COMMAND git clone "https://github.com/${PKG_FORK}/ggnn" --branch ${PKG_PINNED_TAG} ggnn-src + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/ ) + + message("SOURCE ${CMAKE_CURRENT_SOURCE_DIR}") + execute_process ( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/ggnn.patch + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src + ) + endif() + +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_raft_SOURCE=/path/to/local/raft +find_and_configure_ggnn(VERSION 0.5 + FORK cgtuebingen + PINNED_TAG release_0.5 + EXCLUDE_FROM_ALL YES) diff --git a/cpp/cmake/thirdparty/get_glog.cmake b/cpp/cmake/thirdparty/get_glog.cmake new file mode 100644 index 0000000000..9334224de5 --- /dev/null +++ b/cpp/cmake/thirdparty/get_glog.cmake @@ -0,0 +1,49 @@ +#============================================================================= +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_glog) + set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + rapids_cpm_find(glog ${PKG_VERSION} + GLOBAL_TARGETS glog::glog + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/glog.git + GIT_TAG ${PKG_PINNED_TAG} + SOURCE_SUBDIR cpp + EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} + ) + + if(glog_ADDED) + message(VERBOSE "RAFT: Using glog located in ${glog_SOURCE_DIR}") + else() + message(VERBOSE "RAFT: Using glog located in ${glog_DIR}") + endif() + + +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_glog_SOURCE=/path/to/local/glog +find_and_configure_glog(VERSION 0.6.0 + FORK google + PINNED_TAG v0.6.0 + EXCLUDE_FROM_ALL ON + ) \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake new file mode 100644 index 0000000000..94033e8333 --- /dev/null +++ b/cpp/cmake/thirdparty/get_hnswlib.cmake @@ -0,0 +1,49 @@ +#============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_hnswlib) + set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) + if( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/hnswlib-src ) + + execute_process ( + COMMAND git clone --branch=v0.6.2 https://github.com/nmslib/hnswlib.git hnswlib-src + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps ) + + endif () + + include(cmake/modules/FindAVX.cmake) + + set(HNSW_CXX_FLAGS "") + if(CXX_AVX_FOUND) + set(HNSW_CXX_FLAGS "${HNSW_CXX_FLAGS} ${CXX_AVX_FLAGS}") + elseif(CXX_AVX2_FOUND) + set(HNSW_CXX_FLAGS "${HNSW_CXX_FLAGS} ${CXX_AVX2_FLAGS}") + elseif(CXX_AVX512_FOUND) + set(HNSW_CXX_FLAGS "${HNSW_CXX_FLAGS} ${CXX_AVX512_FLAGS}") + endif() +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_raft_SOURCE=/path/to/local/raft +find_and_configure_hnswlib(VERSION 0.6.2 + FORK nmslib + PINNED_TAG v0.6.2 + EXCLUDE_FROM_ALL YES) diff --git a/cpp/cmake/thirdparty/get_nlohmann_json.cmake b/cpp/cmake/thirdparty/get_nlohmann_json.cmake new file mode 100644 index 0000000000..5de98a47ce --- /dev/null +++ b/cpp/cmake/thirdparty/get_nlohmann_json.cmake @@ -0,0 +1,39 @@ +#============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_nlohmann_json) + set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + rapids_cpm_find(nlohmann_json ${PKG_VERSION} + GLOBAL_TARGETS nlohmann_json::nlohmann_json + BUILD_EXPORT_SET raft-bench-ann-exports + INSTALL_EXPORT_SET raft-bench-ann-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/json.git + GIT_TAG ${PKG_PINNED_TAG} + EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}) + +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_raft_SOURCE=/path/to/local/raft +find_and_configure_nlohmann_json(VERSION 3.11.2 + FORK nlohmann + PINNED_TAG v3.11.2 + EXCLUDE_FROM_ALL YES) diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 12360b9482..6e37aab40d 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -21,6 +21,4 @@ function(find_and_configure_thrust) INSTALL_EXPORT_SET raft-exports) endfunction() -if(RAFT_ENABLE_thrust_DEPENDENCY) - find_and_configure_thrust() -endif() +find_and_configure_thrust() diff --git a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh index a9d8777304..bb1d122a24 100644 --- a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh @@ -383,8 +383,8 @@ static int chooseNewCentroid(raft::device_resources const& handle, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); - CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpyAsync( + RAFT_CHECK_CUDA(stream); + RAFT_CUDA_TRY(cudaMemcpyAsync( &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); // Randomly choose observation vector @@ -523,7 +523,7 @@ static int initializeCentroids(raft::device_resources const& handle, WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid - CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); computeDistances<<>>( n, d, 1, obs, centroids + IDX(0, i, d), dists + n); RAFT_CHECK_CUDA(stream); @@ -534,7 +534,7 @@ static int initializeCentroids(raft::device_resources const& handle, } // Compute cluster sizes - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); computeClusterSizes<<>>(n, codes, clusterSizes); RAFT_CHECK_CUDA(stream); @@ -598,7 +598,7 @@ static int assignCentroids(raft::device_resources const& handle, RAFT_CHECK_CUDA(stream); // Find centroid closest to each observation vector - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), 
stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; @@ -606,7 +606,7 @@ static int assignCentroids(raft::device_resources const& handle, gridDim.y = 1; gridDim.z = 1; minDistances<<>>(n, k, dists, codes, clusterSizes); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Compute residual sum of squares *residual_host = thrust::reduce( @@ -825,8 +825,8 @@ int kmeans(raft::device_resources const& handle, // Trivial cases if (k == 1) { - CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - CUDA_TRY( + RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); + RAFT_CUDA_TRY( cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not compute k-means centroids"); @@ -837,7 +837,7 @@ int kmeans(raft::device_resources const& handle, 1, std::min(ceildiv(n, BLOCK_SIZE / WARP_SIZE), grid_lower_bound)}; - CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); computeDistances<<>>(n, d, 1, obs, centroids, work); RAFT_CHECK_CUDA(stream); *residual_host = thrust::reduce( diff --git a/cpp/include/raft/core/kvp.hpp b/cpp/include/raft/core/kvp.hpp index 8d3321eb77..192d160d45 100644 --- a/cpp/include/raft/core/kvp.hpp +++ b/cpp/include/raft/core/kvp.hpp @@ -20,7 +20,7 @@ #ifdef _RAFT_HAS_CUDA #include -#include +#include // raft::shfl_xor #endif namespace raft { /** diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index f469250b45..7493c4e558 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -16,25 +16,18 @@ #pragma once -#include -#include - -#include -#include -#include -#include -#include 
- #include - +#include #include #include - +#include +#include #include #include -#include -#include -#include +#include +#include +#include +#include namespace raft { namespace distance { @@ -140,14 +133,14 @@ void distance_impl(raft::resources const& handle, cudaStream_t stream = raft::resource::get_cuda_stream(handle); - AccT* norm_col_vec = workspace; - AccT* norm_row_vec = workspace; - AccT* sq_norm_col_vec = workspace; - AccT* sq_norm_row_vec = workspace; + AccT* x_norm = workspace; + AccT* y_norm = workspace; + AccT* sq_x_norm = workspace; + AccT* sq_y_norm = workspace; if (x != y) { - norm_row_vec += m; + y_norm += m; - raft::linalg::reduce(norm_col_vec, + raft::linalg::reduce(x_norm, x, k, m, @@ -158,7 +151,7 @@ void distance_impl(raft::resources const& handle, false, raft::identity_op(), raft::add_op()); - raft::linalg::reduce(norm_row_vec, + raft::linalg::reduce(y_norm, y, k, n, @@ -170,12 +163,12 @@ void distance_impl(raft::resources const& handle, raft::identity_op(), raft::add_op()); - sq_norm_col_vec += (m + n); - sq_norm_row_vec = sq_norm_col_vec + m; - raft::linalg::rowNorm(sq_norm_col_vec, x, k, m, raft::linalg::L2Norm, is_row_major, stream); - raft::linalg::rowNorm(sq_norm_row_vec, y, k, n, raft::linalg::L2Norm, is_row_major, stream); + sq_x_norm += (m + n); + sq_y_norm = sq_x_norm + m; + raft::linalg::rowNorm(sq_x_norm, x, k, m, raft::linalg::L2Norm, is_row_major, stream); + raft::linalg::rowNorm(sq_y_norm, y, k, n, raft::linalg::L2Norm, is_row_major, stream); } else { - raft::linalg::reduce(norm_col_vec, + raft::linalg::reduce(x_norm, x, k, m, @@ -186,15 +179,15 @@ void distance_impl(raft::resources const& handle, false, raft::identity_op(), raft::add_op()); - sq_norm_col_vec += m; - sq_norm_row_vec = sq_norm_col_vec; - raft::linalg::rowNorm(sq_norm_col_vec, x, k, m, raft::linalg::L2Norm, is_row_major, stream); + sq_x_norm += m; + sq_y_norm = sq_x_norm; + raft::linalg::rowNorm(sq_x_norm, x, k, m, raft::linalg::L2Norm, is_row_major, stream); } 
using OpT = ops::correlation_distance_op; - OpT corr_op(is_row_major, sq_norm_col_vec, sq_norm_row_vec, m, n, k); + OpT corr_op(is_row_major, sq_x_norm, sq_y_norm, m, n, k); pairwise_matrix_dispatch( - corr_op, m, n, k, x, y, norm_col_vec, norm_row_vec, out, fin_op, stream, is_row_major); + corr_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } template @@ -223,22 +216,22 @@ void distance_impl(raft::resources const& handle, cudaStream_t stream = raft::resource::get_cuda_stream(handle); - DataT* norm_A = workspace; - DataT* norm_B = workspace; + DataT* x_norm = workspace; + DataT* y_norm = workspace; if (x != y) { - norm_B += m; + y_norm += m; raft::linalg::rowNorm( - norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); + x_norm, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); raft::linalg::rowNorm( - norm_B, y, k, n, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); + y_norm, y, k, n, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); } else { raft::linalg::rowNorm( - norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); + x_norm, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); } ops::cosine_distance_op distance_op{}; pairwise_matrix_dispatch( - distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } template @@ -389,10 +382,6 @@ void distance_impl(raft::resources const& handle, return (!x_zero) * raft::exp(input); }; - // This op takes some shortcuts when x equals y. So its behavior changes based - // on this. 
- ops::kl_divergence_op kl_divergence{is_row_major, x == y}; - if (x != y) { raft::linalg::unaryOp( (DataT*)y, y, n * k, unaryOp_lambda, stream); @@ -401,8 +390,12 @@ void distance_impl(raft::resources const& handle, const DataT* x_norm = nullptr; const DataT* y_norm = nullptr; - pairwise_matrix_dispatch( - kl_divergence, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); + // This op takes some shortcuts when x equals y. So its behavior changes based + // on this. + ops::kl_divergence_op distance_op{is_row_major, x == y}; + + pairwise_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); if (x != y) { // Now reverse previous log (x) back to x using (e ^ log(x)) @@ -464,22 +457,22 @@ void distance_impl_l2_expanded( // NOTE: different name "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); - DataT* norm_A = workspace; - DataT* norm_B = workspace; + DataT* x_norm = workspace; + DataT* y_norm = workspace; if (x != y) { - norm_B += m; + y_norm += m; raft::linalg::rowNorm( - norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{}); + x_norm, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{}); raft::linalg::rowNorm( - norm_B, y, k, n, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{}); + y_norm, y, k, n, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{}); } else { raft::linalg::rowNorm( - norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{}); + x_norm, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{}); } ops::l2_exp_distance_op distance_op{perform_sqrt}; pairwise_matrix_dispatch( - distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } template @@ -543,13 +536,13 @@ void distance_impl(raft::resources const& handle, ops::l2_unexp_distance_op 
l2_op(perform_sqrt); // The unexpanded L2 does not require the norms of a and b to be calculated. - const DataT* norm_A = nullptr; - const DataT* norm_B = nullptr; + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; cudaStream_t stream = raft::resource::get_cuda_stream(handle); pairwise_matrix_dispatch( - l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + l2_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } template @@ -571,13 +564,13 @@ void distance_impl(raft::resources const& handle, ops::l2_unexp_distance_op l2_op(perform_sqrt); // The unexpanded L2 does not require the norms of a and b to be calculated. - const DataT* norm_A = nullptr; - const DataT* norm_B = nullptr; + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; cudaStream_t stream = raft::resource::get_cuda_stream(handle); pairwise_matrix_dispatch( - l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + l2_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } template diff --git a/cpp/include/raft/distance/detail/distance_ops/canberra.cuh b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh index 930294ce31..eaf37b7e9c 100644 --- a/cpp/include/raft/distance/detail/distance_ops/canberra.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh @@ -16,7 +16,8 @@ #pragma once -#include +#include // raft::abs +#include // DI namespace raft::distance::detail::ops { @@ -42,7 +43,7 @@ struct canberra_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
template - constexpr size_t shared_mem_size() + static constexpr size_t shared_mem_size() { return Policy::SmemSize; } diff --git a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh index 289b69070a..4fc4bb8297 100644 --- a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include // DI namespace raft::distance::detail::ops { @@ -61,7 +61,7 @@ struct correlation_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. template - constexpr size_t shared_mem_size() + static constexpr size_t shared_mem_size() { return Policy::SmemSize + (2 * (Policy::Mblk + Policy::Nblk) * sizeof(DataT)); } diff --git a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh index 7c37c27b4e..0883136c9f 100644 --- a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include // DI namespace raft::distance::detail::ops { @@ -26,7 +26,7 @@ struct cosine_cutlass_op { __device__ cosine_cutlass_op() noexcept {} __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept { - return static_cast(1.0) - (AccT)(accVal / (aNorm * bNorm)); + return static_cast(1.0) - static_cast(accVal / (aNorm * bNorm)); } __device__ AccT operator()(DataT aData) const noexcept { return aData; } }; @@ -53,7 +53,7 @@ struct cosine_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
template - constexpr size_t shared_mem_size() + static constexpr size_t shared_mem_size() { return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); } @@ -76,7 +76,10 @@ struct cosine_distance_op { } } - cosine_cutlass_op get_cutlass_op() { return cosine_cutlass_op(); } + constexpr cosine_cutlass_op get_cutlass_op() const + { + return cosine_cutlass_op(); + } }; } // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/cutlass.cuh b/cpp/include/raft/distance/detail/distance_ops/cutlass.cuh index d3eb90467b..7a4fe0ce83 100644 --- a/cpp/include/raft/distance/detail/distance_ops/cutlass.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/cutlass.cuh @@ -16,7 +16,8 @@ #pragma once -#include +#include // std::false_type +#include // std::declval namespace raft::distance::detail::ops { @@ -34,7 +35,8 @@ struct has_cutlass_op : std::false_type { // Specialization recognizes types that do support CUTLASS template -struct has_cutlass_op> : std::true_type { +struct has_cutlass_op().get_cutlass_op())>> + : std::true_type { }; } // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh index 1cfdcfdc73..475b8892e9 100644 --- a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include // DI namespace raft::distance::detail::ops { @@ -45,7 +45,7 @@ struct hamming_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
template - constexpr size_t shared_mem_size() + static constexpr size_t shared_mem_size() { return Policy::SmemSize; } diff --git a/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh b/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh index c4aecc7a6f..0489b45854 100644 --- a/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include // DI namespace raft::distance::detail::ops { @@ -42,7 +42,7 @@ struct hellinger_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. template - constexpr size_t shared_mem_size() + static constexpr size_t shared_mem_size() { return Policy::SmemSize; } diff --git a/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh b/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh index 41eeb9dd83..e46c63734c 100644 --- a/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh @@ -15,7 +15,8 @@ */ #pragma once -#include +#include // raft::log +#include // DI namespace raft::distance::detail::ops { @@ -44,7 +45,7 @@ struct jensen_shannon_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
template - constexpr size_t shared_mem_size() + static constexpr size_t shared_mem_size() { return Policy::SmemSize; } diff --git a/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh b/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh index d046b62c30..d083c5ddcc 100644 --- a/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh @@ -15,7 +15,8 @@ */ #pragma once -#include +#include // raft::log +#include // DI namespace raft::distance::detail::ops { @@ -49,7 +50,7 @@ struct kl_divergence_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. template - constexpr size_t shared_mem_size() + static constexpr size_t shared_mem_size() { return Policy::SmemSize; } diff --git a/cpp/include/raft/distance/detail/distance_ops/l1.cuh b/cpp/include/raft/distance/detail/distance_ops/l1.cuh index 8ec4000827..7e86fd3603 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l1.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l1.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include // DI namespace raft::distance::detail::ops { @@ -41,7 +41,7 @@ struct l1_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
template - constexpr size_t shared_mem_size() + static constexpr size_t shared_mem_size() { return Policy::SmemSize; } diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh index 2a7af53813..95577fd311 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include // DI namespace raft::distance::detail::ops { @@ -54,7 +54,7 @@ struct l2_exp_distance_op { using AccT = AccType; using IdxT = IdxType; - bool sqrt; + const bool sqrt; l2_exp_distance_op(bool sqrt_) noexcept : sqrt(sqrt_) {} @@ -67,7 +67,7 @@ struct l2_exp_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. template - constexpr size_t shared_mem_size() + static constexpr size_t shared_mem_size() { return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); } @@ -102,7 +102,10 @@ struct l2_exp_distance_op { } } - l2_exp_cutlass_op get_cutlass_op() { return l2_exp_cutlass_op(sqrt); } + constexpr l2_exp_cutlass_op get_cutlass_op() const + { + return l2_exp_cutlass_op(sqrt); + } }; } // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh index f0ea591eaf..62c212ee8f 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include // DI namespace raft::distance::detail::ops { @@ -46,7 +46,7 @@ struct l2_unexp_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
template - constexpr size_t shared_mem_size() + static constexpr size_t shared_mem_size() { return Policy::SmemSize; } diff --git a/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh b/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh index fb21fb1a21..88853a3083 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include // DI namespace raft::distance::detail::ops { @@ -42,7 +42,7 @@ struct l_inf_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. template - constexpr size_t shared_mem_size() + static constexpr size_t shared_mem_size() { return Policy::SmemSize; } diff --git a/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh b/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh index 71dfd51a6e..290f4af1b4 100644 --- a/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh @@ -15,7 +15,8 @@ */ #pragma once -#include +#include // raft::pow, raft::abs +#include // DI namespace raft::distance::detail::ops { @@ -45,7 +46,7 @@ struct lp_unexp_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
template - constexpr size_t shared_mem_size() + static constexpr size_t shared_mem_size() { return Policy::SmemSize; } diff --git a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh index ea09e4d1db..63dbf350d1 100644 --- a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include // DI namespace raft::distance::detail::ops { @@ -47,7 +47,7 @@ struct russel_rao_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. template - constexpr size_t shared_mem_size() + static constexpr size_t shared_mem_size() { return Policy::SmemSize; } diff --git a/cpp/include/raft/distance/detail/distance_ops/template.cuh b/cpp/include/raft/distance/detail/distance_ops/template.cuh index 6998f3cad4..4320068361 100644 --- a/cpp/include/raft/distance/detail/distance_ops/template.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/template.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include // DI namespace raft::distance::detail::ops { @@ -42,8 +42,8 @@ struct template_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. - template - constexpr size_t shared_mem_size() + template + static constexpr size_t shared_mem_size() { return Policy::SmemSize + TODO; } @@ -59,6 +59,10 @@ struct template_distance_op { { TODO; } + + // If exist, returns a cutlass op that performs the same operation. + // See cosine and l2_exp distance ops for an example. 
+ constexpr l2_exp_cutlass_op get_cutlass_op() const { TODO; } }; } // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index 8fbd7a9c69..be6fed9f10 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -16,23 +16,20 @@ #pragma once -#include -#include -#include -#include -#include -#include +#include // size_t +#include // std::numeric_limits +#include // raft::KeyValuePair +#include // raft::identity_op +#include // ops::l2_exp_distance_op +#include // PairwiseDistances +#include // Policy +#include // raft::ceildiv, raft::shfl namespace raft { namespace distance { namespace detail { -#if (ENABLE_MEMCPY_ASYNC == 1) -#include -using namespace nvcuda::experimental; -#endif - template struct KVPMinReduceImpl { typedef raft::KeyValuePair KVP; @@ -124,11 +121,10 @@ DI void updateReducedVal( template __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, const DataT* x, @@ -142,7 +138,7 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, int* mutex, ReduceOpT redOp, KVPReduceOpT pairRedOp, - CoreLambda core_op, + OpT distance_op, FinalLambda fin_op) { extern __shared__ char smem[]; @@ -163,24 +159,6 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, IdxT gridStrideY) { KVPReduceOpT pairRed_op(pairRedOp); -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - acc[i][j] = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j]; - } - } - if (Sqrt) { -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - auto acc_ij = acc[i][j]; - acc[i][j] = acc_ij > DataT{0} ? 
raft::sqrt(acc_ij) : DataT{0}; - } - } - } - // intra thread reduce const auto acccolid = threadIdx.x % P::AccThCols; const auto accrowid = threadIdx.x / P::AccThCols; @@ -229,18 +207,18 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, }; IdxT lda = k, ldb = k, ldd = n; - PairwiseDistances + row_major, + write_out> obj(x, y, m, @@ -251,9 +229,9 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, ldd, xn, yn, - nullptr, + nullptr, // Output pointer smem, - core_op, + distance_op, epilog_lambda, fin_op, rowEpilog_lambda); @@ -289,9 +267,6 @@ void fusedL2NNImpl(OutT* min, constexpr auto maxVal = std::numeric_limits::max(); typedef KeyValuePair KVPair; - // Accumulation operation lambda - auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { acc += x * y; }; - RAFT_CUDA_TRY(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); if (initOutBuffer) { initKernel @@ -300,59 +275,25 @@ void fusedL2NNImpl(OutT* min, } constexpr size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); - if (sqrt) { - auto fusedL2NNSqrt = fusedL2NNkernel; - dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NNSqrt); - - fusedL2NNSqrt<<>>(min, - x, - y, - xn, - yn, - m, - n, - k, - maxVal, - workspace, - redOp, - pairRedOp, - core_lambda, - raft::identity_op{}); - } else { - auto fusedL2NN = fusedL2NNkernel; - dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NN); - fusedL2NN<<>>(min, - x, - y, - xn, - yn, - m, - n, - k, - maxVal, - workspace, - redOp, - pairRedOp, - core_lambda, - raft::identity_op{}); - } + using AccT = DataT; + ops::l2_exp_distance_op distance_op{sqrt}; + + raft::identity_op fin_op{}; + + auto kernel = fusedL2NNkernel; + + dim3 grid = launchConfigGenerator

(m, n, shmemSize, kernel); + + kernel<<>>( + min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, distance_op, fin_op); RAFT_CUDA_TRY(cudaGetLastError()); } diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index 0293f10c29..c6b09be31e 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -14,14 +14,11 @@ * limitations under the License. */ #pragma once -#include -#include -#include -#include -#include -#include +#include // raft::linalg::Contractions_NT +#include // ceildiv +#include // RAFT_CUDA_TRY -#include +#include // size_t namespace raft { namespace distance { @@ -29,16 +26,12 @@ namespace detail { /** * @brief Device class for L1, L2 and cosine distance metrics. - * @tparam useNorms whether norms are needed * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type * @tparam OutT output data-type (for C and D matrices) * @tparam IdxT index data-type * @tparam Policy struct which tunes the Contraction kernel - * @tparam CoreLambda tells how to accumulate an x and y into - acc. its signature: - template void core_lambda(AccT& acc, - const DataT& x, const DataT& y) + * @tparam OpT A distance operation, e.g., cosine_distance_op. * @tparam EpilogueLambda applies an elementwise function to compute final values. Its signature is: template void epilogue_lambda @@ -56,19 +49,17 @@ namespace detail { * @param[in] yn row norms of input matrix B. Required for expanded L2, cosine * @param[output] pD output matrix * @param[in] smem shared mem buffer for intermediate storage of A, B, xn & yn. - * @param core_op the core accumulation operation lambda + * @param distance_op the distance operation, e.g. 
cosine_distance_op * @param epilog_op the epilog operation lambda * @param fin_op the final gemm epilogue lambda * @param rowEpilog_op epilog lambda that executes when a full row has been processed */ -template > struct PairwiseDistances : public BaseClass { + // Get accumulation type from distance_op + using AccT = typename OpT::AccT; + private: typedef Policy P; const DataT* xn; @@ -83,7 +77,7 @@ struct PairwiseDistances : public BaseClass { const DataT* const yBase; OutT* dOutput; char* smem; - CoreLambda core_op; + OpT distance_op; EpilogueLambda epilog_op; FinalLambda fin_op; rowEpilogueLambda rowEpilog_op; @@ -109,7 +103,7 @@ struct PairwiseDistances : public BaseClass { const DataT* _yn, OutT* _dOutput, char* _smem, - CoreLambda _core_op, + OpT _distance_op, EpilogueLambda _epilog_op, FinalLambda _fin_op, rowEpilogueLambda _rowEpilog_op) @@ -119,7 +113,7 @@ struct PairwiseDistances : public BaseClass { yBase(_y), dOutput(_dOutput), smem(_smem), - core_op(_core_op), + distance_op(_distance_op), epilog_op(_epilog_op), fin_op(_fin_op), rowEpilog_op(_rowEpilog_op), @@ -159,15 +153,25 @@ struct PairwiseDistances : public BaseClass { this->switch_read_buffer(); // Epilog: - if (useNorms) { + if (distance_op.use_norms) { DataT regxn[P::AccRowsPerTh], regyn[P::AccColsPerTh]; load_norms(tile_idx_m, tile_idx_n, regxn, regyn); // Overlap ldg with epilog computation ldgNextGridStride(tile_idx_m, tile_idx_n); + // Calculate distance_op epilog. + // Use .template to disambiguate (See: + // https://en.cppreference.com/w/cpp/language/dependent_name) + distance_op.template epilog(acc, regxn, regyn, tile_idx_n, tile_idx_m); + // And any possible additional epilogs epilog_op(acc, regxn, regyn, tile_idx_n, tile_idx_m); } else { // Overlap ldg with epilog computation ldgNextGridStride(tile_idx_m, tile_idx_n); + // Calculate distance_op epilog. 
+ // Use .template to disambiguate (See: + // https://en.cppreference.com/w/cpp/language/dependent_name) + distance_op.template epilog(acc, nullptr, nullptr, tile_idx_n, tile_idx_m); + // And any possible additional epilogs epilog_op(acc, nullptr, nullptr, tile_idx_n, tile_idx_m); } if (writeOut) { store_output(tile_idx_m, tile_idx_n); } @@ -201,24 +205,41 @@ struct PairwiseDistances : public BaseClass { } } - DI void accumulate() + DI void accumulate_reg_tile(DataT (®_x)[P::AccRowsPerTh][P::Veclen], + DataT (®_y)[P::AccColsPerTh][P::Veclen]) { #pragma unroll - for (int ki = 0; ki < P::Kblk; ki += P::Veclen) { - this->ldsXY(ki); + for (int v = 0; v < P::Veclen; ++v) { #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { -#pragma unroll - for (int v = 0; v < P::Veclen; ++v) { - core_op(acc[i][j], this->regx[i][v], this->regy[j][v]); - } + distance_op.core(acc[i][j], reg_x[i][v], reg_y[j][v]); } } } } + DI void accumulate() + { + // We have a separate ldsXY and accumulate_reg_tile outside the loop body, + // so that these separated calls can be interspersed with preceding and + // following instructions, thereby hiding latency. + this->ldsXY(0); + + // If expensive inner loop, do not unroll loop. + constexpr int num_iterations = P::Kblk / P::Veclen - 1; + constexpr int unroll_count = decltype(distance_op)::expensive_inner_loop ? 1 : num_iterations; +#pragma unroll unroll_count + for (int ki = P::Veclen; ki < P::Kblk; ki += P::Veclen) { + accumulate_reg_tile(this->regx, this->regy); + this->ldsXY(ki); + } + + // Accumulate last loaded tile. 
+ accumulate_reg_tile(this->regx, this->regy); + } + DI void load_norms(IdxT tile_idx_m, IdxT tile_idx_n, DataT (®xn)[P::AccRowsPerTh], @@ -274,7 +295,11 @@ struct PairwiseDistances : public BaseClass { template dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func) { - const auto numSMs = raft::getMultiProcessorCount(); + int devId; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + int numSMs; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, devId)); + int numBlocksPerSm = 0; dim3 grid; diff --git a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh index c5fdd28117..efcd5d9389 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh @@ -64,21 +64,20 @@ template -typename std::enable_if::value>::type cutlassDistanceKernel( - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - OpT distance_op, - cudaStream_t stream) +std::enable_if_t::value> cutlassDistanceKernel(const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + OpT distance_op, + cudaStream_t stream) { static_assert(!(std::is_same::value), "OutType bool is not supported use uint8_t instead"); diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index 8524ce6fdf..e04b56ee8a 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -15,63 +15,74 @@ */ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include +/* This file has two 
responsibilities: + * + * 1. Dispatch to the correct implementation of a kernel based on the + * architecture of the device on which the kernel will be launched. For + * instance, the cosine distance has a CUTLASS-based implementation that can + * be used on SM80+ and the normal implementation that is used on older + * architectures. + * + * 2. Provide concise function templates that can be instantiated in + * src/distance/distance/specializations/detail/. Previously, + * raft::distance::detail::distance was instantiated. The function + * necessarily required a large set of include files, which slowed down the + * build. The raft::distance::detail::pairwise_matrix_arch_dispatch functions + * do not require as large an include files set, which speeds up the build. + */ + +#include // ops::has_cutlass_op +#include // dispatch_sm60 +#include // pairwise_matrix_params +#include // raft::util::arch::SM_* + +// NOTE: to minimize compile times, we do not include dispatch_sm80.cuh. +// Including dispatch_sm80.cuh can slow down compile times (due to CUTLASS). +// Therefore, it is the including file's responsibility to include the correct +// dispatch_smXX.cuh headers, as is done in raft/distance/detail/distance.cuh +// and the specializations in src/distance/distance/specializations/detail/. namespace raft::distance::detail { +// This forward-declaration ensures that we do not need to include +// dispatch_sm80.cuh if we are not calling it in practice. This makes compiling +// all the non-CUTLASS based distance specializations faster. For CUTLASS-based +// distances, dispatch_sm80.cuh has to be included by the file including this +// file. template -void pairwise_matrix_dispatch(OpT distance_op, - IdxT m, - IdxT n, - IdxT k, - const DataT* x, - const DataT* y, - const DataT* x_norm, - const DataT* y_norm, - OutT* out, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major) -{ - // Create kernel parameter struct. Flip x and y if column major. - IdxT ldx = is_row_major ? 
k : m; - IdxT ldy = is_row_major ? k : n; - IdxT ld_out = is_row_major ? n : m; - - pairwise_matrix_params params{ - m, n, k, ldx, ldy, ld_out, x, y, x_norm, y_norm, out, fin_op, is_row_major}; - - if (!params.is_row_major) { params.flip_x_and_y(); } + typename SM_compat_t> +void pairwise_matrix_sm80_dispatch(OpT, + pairwise_matrix_params, + SM_compat_t, + cudaStream_t); +template +void pairwise_matrix_instantiation_point(OpT distance_op, + pairwise_matrix_params params, + cudaStream_t stream) +{ // On CUDA 12: // - always execute normal kernel // // On CUDA 11 and below: // - execute CUTLASS-based kernel on SM_80 and above // - execute normal kernel below SM_80 + namespace arch = raft::util::arch; constexpr bool is_ctk_12 = __CUDACC_VER_MAJOR__ == 12; constexpr bool cutlass_op_unavailable = !ops::has_cutlass_op(); if constexpr (is_ctk_12 || cutlass_op_unavailable) { // Always execute legacy kernels on CUDA 12 - auto any_range = raft::arch::SM_range(raft::arch::SM_min(), raft::arch::SM_future()); + auto any_range = arch::SM_range(arch::SM_min(), arch::SM_future()); pairwise_matrix_sm60_dispatch(distance_op, params, any_range, stream); } else { - auto cutlass_range = raft::arch::SM_range(raft::arch::SM_80(), raft::arch::SM_future()); - auto legacy_range = raft::arch::SM_range(raft::arch::SM_min(), raft::arch::SM_80()); + auto cutlass_range = arch::SM_range(arch::SM_80(), arch::SM_future()); + auto legacy_range = arch::SM_range(arch::SM_min(), arch::SM_80()); // Get pointer to SM60 kernel to determine the runtime architecture of the // current system. 
Other methods to determine the architecture (that do not @@ -79,7 +90,7 @@ void pairwise_matrix_dispatch(OpT distance_op, // https://github.com/NVIDIA/cub/issues/545 auto sm60_wrapper = pairwise_matrix_sm60_get_wrapper(distance_op, params, legacy_range); void* kernel_ptr = reinterpret_cast(sm60_wrapper.kernel_ptr); - auto runtime_arch = raft::arch::kernel_runtime_arch(kernel_ptr); + auto runtime_arch = arch::kernel_runtime_arch(kernel_ptr); if (cutlass_range.contains(runtime_arch)) { // If device is SM_80 or later, use CUTLASS-based kernel. @@ -92,4 +103,35 @@ void pairwise_matrix_dispatch(OpT distance_op, } } +template +void pairwise_matrix_dispatch(OpT distance_op, + IdxT m, + IdxT n, + IdxT k, + const DataT* x, + const DataT* y, + const DataT* x_norm, + const DataT* y_norm, + OutT* out, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major) +{ + // Create kernel parameter struct. Flip x and y if column major. + IdxT ldx = is_row_major ? k : m; + IdxT ldy = is_row_major ? k : n; + IdxT ld_out = is_row_major ? 
n : m; + + pairwise_matrix_params params{ + m, n, k, ldx, ldy, ld_out, x, y, x_norm, y_norm, out, fin_op, is_row_major}; + + if (!params.is_row_major) { params.flip_x_and_y(); } + pairwise_matrix_instantiation_point(distance_op, params, stream); +} + }; // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch_layout.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch_layout.cuh index c1e4c08af4..f2b0e59822 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch_layout.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch_layout.cuh @@ -15,10 +15,11 @@ */ #pragma once -#include "kernel_sm60.cuh" -#include -#include - +#include // std::min +#include // size_t +#include // RAFT_EXPECTS +#include // pairwise_matrix_params +#include // std::integral_constant namespace raft::distance::detail { /** @@ -99,15 +100,15 @@ auto dispatch_layout(bool row_major, int vec_len, F&& f) { if (row_major) { switch (vec_len) { - case 4: return f(std::bool_constant(), vec_len_constant<4>()); - case 2: return f(std::bool_constant(), vec_len_constant<2>()); - default: return f(std::bool_constant(), vec_len_constant<1>()); + case 4: return f(std::true_type(), vec_len_constant<4>()); + case 2: return f(std::true_type(), vec_len_constant<2>()); + default: return f(std::true_type(), vec_len_constant<1>()); } } else { switch (vec_len) { - case 4: return f(std::bool_constant(), vec_len_constant<4>()); - case 2: return f(std::bool_constant(), vec_len_constant<2>()); - default: return f(std::bool_constant(), vec_len_constant<1>()); + case 4: return f(std::false_type(), vec_len_constant<4>()); + case 2: return f(std::false_type(), vec_len_constant<2>()); + default: return f(std::false_type(), vec_len_constant<1>()); } } } diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh index 6e284007ea..2080fbe9cd 100644 --- 
a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh @@ -15,10 +15,10 @@ */ #pragma once -#include -#include -#include -#include +#include // std::min +#include // dispatch_layout +#include // pairwise_matrix_sm60_wrapper +#include // raft::linalg::Policy4x4 namespace raft::distance::detail { @@ -35,7 +35,11 @@ pairwise_matrix_sm60_wrapper pairwise_matrix_sm6 { int vec_len = determine_vec_len(params); - return dispatch_layout(params.is_row_major, vec_len, [&](auto row_major, auto vec_len_aligned) { + // f takes compile-time constants row_major and vec_len aligned and returns + // the corresponding kernel wrapper. The wrapper contains the launch + // parameters of the kernel: a pointer to the kernel function, grid size, + // block size, and shared memory size. + auto f = [&](auto row_major, auto vec_len_aligned) { // row_major and vec_len are std::integral_constants of type bool and int // respectively. @@ -46,15 +50,19 @@ pairwise_matrix_sm60_wrapper pairwise_matrix_sm6 // Prevent double, vec_len=4 combination (this is not supported) constexpr int vec_len = std::min(vec_len_op, static_cast(16 / sizeof(DataT))); - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type Policy; + using RowPolicy = typename raft::linalg::Policy4x4::Policy; + using ColPolicy = typename raft::linalg::Policy4x4::ColPolicy; + using Policy = typename std::conditional::type; auto wrapper = make_pairwise_matrix_sm60_wrapper(distance_op, params, sm_compat_range); return wrapper; - }); + }; + + // Dispatch_layout calls f with appropriate compile time constants based on + // the runtime values of params.is_row_major and vec_len. 
+ return dispatch_layout(params.is_row_major, vec_len, f); } template // std::min -#include -#include +#include // std::min +#include // cutlassDistanceKernel +#include // dispatch_layout namespace raft::distance::detail { @@ -34,7 +34,9 @@ void pairwise_matrix_sm80_dispatch(OpT distance_op, { int vec_len = determine_vec_len(params); - dispatch_layout(params.is_row_major, vec_len, [&](auto row_major, auto vec_len_aligned) { + // f takes compile-time constants row_major and vec_len aligned and runs the + // corresponding cutlass launch code. + auto f = [&](auto row_major, auto vec_len_aligned) { // row_major and vec_len are std::integral_constants of type bool and int // respectively. @@ -56,7 +58,11 @@ void pairwise_matrix_sm80_dispatch(OpT distance_op, params.fin_op, distance_op, stream); - }); + }; + + // Dispatch_layout calls f with appropriate compile time constants based on + // the runtime values of params.is_row_major and vec_len. + dispatch_layout(params.is_row_major, vec_len, f); } }; // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh index 6e3ab7b26b..2d0a98862e 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh @@ -15,11 +15,11 @@ */ #pragma once -#include -#include -#include -#include -#include +#include // assert +#include // raft::void_op +#include // PairwiseDistances +#include // pairwise_matrix_params +#include // raft::util::arch::SM_compute_arch namespace raft::distance::detail { @@ -36,43 +36,27 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void pairwise_matrix_kernel( { // Early exit to minimize the size of the kernel when it is not supposed to be compiled. 
constexpr SM_compat_t sm_compat_range{}; - if constexpr (!sm_compat_range.contains(raft::arch::SM_compute_arch())) { + if constexpr (!sm_compat_range.contains(raft::util::arch::SM_compute_arch())) { assert(false); return; } extern __shared__ char smem[]; - using AccT = typename OpT::AccT; - - // Wrap operator back into lambdas. This is temporary and should be removed. - // See: https://github.com/rapidsai/raft/issues/1323 - auto core_op = [distance_op] __device__(AccT & acc, DataT & x, DataT & y) { - distance_op.core(acc, x, y); - }; - auto epilog_op = [distance_op] __device__(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { - // Use .template to disambiguate (See: - // https://en.cppreference.com/w/cpp/language/dependent_name) - distance_op.template epilog(acc, regxn, regyn, gridStrideX, gridStrideY); - }; - + // The epilog is already provided by distance_op. Do not provide additional + // epilogs. + auto epilog_op = raft::void_op(); // No support for row_epilog_op. auto row_epilog_op = raft::void_op(); // Always write output constexpr bool write_out = true; constexpr bool use_norms = distance_op.use_norms; - PairwiseDistances -void pairwise_matrix(OpT distance_op, - pairwise_matrix_params params, - cudaStream_t stream) -{ - dim3 blk(Policy::Nthreads); - // Use .template to disambiguate (See: - // https://en.cppreference.com/w/cpp/language/dependent_name) - size_t smem_size = distance_op.template shared_mem_size(); - // Obtain function pointer to kernel - auto kernel = - pairwise_matrix_kernel; - dim3 grid = launchConfigGenerator(params.m, params.n, smem_size, kernel); - - kernel<<>>(distance_op, params); - RAFT_CUDA_TRY(cudaGetLastError()); -} - // The type of a pointer to the pairwise matrix kernel. 
The following template // arguments are type-erased: // @@ -181,9 +140,9 @@ pairwise_matrix_sm60_wrapper make_pairwise_matri SM_compat_t sm_compat_range) { dim3 block(Policy::Nthreads); - // Use .template to disambiguate (See: + // Use ::template to disambiguate (See: // https://en.cppreference.com/w/cpp/language/dependent_name) - int smem_size = distance_op.template shared_mem_size(); + int smem_size = OpT::template shared_mem_size(); // Obtain function pointer to kernel auto kernel = pairwise_matrix_kernel; diff --git a/cpp/include/raft/distance/specializations/detail/00_write_template.py b/cpp/include/raft/distance/specializations/detail/00_write_template.py new file mode 100644 index 0000000000..63ae6580b4 --- /dev/null +++ b/cpp/include/raft/distance/specializations/detail/00_write_template.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 + +# This template manages all files in this directory, apart from +# inner_product.cuh and kernels.cuh. + + +# NOTE: this template is not perfectly formatted. Use pre-commit to get +# everything in shape again. +start_template = """/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace raft::distance::detail { + +""" + +extern_template = """ +extern template void pairwise_matrix_instantiation_point( + OpT, + pairwise_matrix_params, + cudaStream_t); +""" + +end_template = """} // namespace raft::distance::detail +""" + +data_type_instances = [ + dict( + DataT="float", + AccT="float", + OutT="float", + IdxT="int", + ), + dict( + DataT="double", + AccT="double", + OutT="double", + IdxT="int", + ), +] + + + + +op_instances = [ + dict( + path_prefix="canberra", + OpT="ops::canberra_distance_op", + ), + dict( + path_prefix="correlation", + OpT="ops::correlation_distance_op", + ), + dict( + path_prefix="cosine", + OpT="ops::cosine_distance_op", + # cosine uses CUTLASS for SM80+ + ), + dict( + path_prefix="hamming_unexpanded", + OpT="ops::hamming_distance_op", + ), + dict( + path_prefix="hellinger_expanded", + OpT="ops::hellinger_distance_op", + ), + # inner product is handled by cublas. + dict( + path_prefix="jensen_shannon", + OpT="ops::jensen_shannon_distance_op", + ), + dict( + path_prefix="kl_divergence", + OpT="ops::kl_divergence_op", + ), + dict( + path_prefix="l1", + OpT="ops::l1_distance_op", + ), + dict( + path_prefix="l2_expanded", + OpT="ops::l2_exp_distance_op", + # L2 expanded uses CUTLASS for SM80+ + ), + dict( + path_prefix="l2_unexpanded", + OpT="ops::l2_unexp_distance_op", + ), + dict( + path_prefix="l_inf", + OpT="ops::l_inf_distance_op", + ), + dict( + path_prefix="lp_unexpanded", + OpT="ops::lp_unexp_distance_op", + ), + dict( + path_prefix="russel_rao", + OpT="ops::russel_rao_distance_op", + ), +] + +def fill_in(s, template): + for k, v in template.items(): + s = s.replace(k, v) + return s + +for op_instance in op_instances: + path = fill_in("path_prefix.cuh", op_instance) + with open(path, "w") as f: + f.write(start_template) + + for data_type_instance in data_type_instances: + op_data_instance = { + k : fill_in(v, data_type_instance) + for k, v in op_instance.items() + } + instance = { 
+ **op_data_instance, + **data_type_instance, + "FinopT": "raft::identity_op", + } + + text = fill_in(extern_template, instance) + + f.write(text) + + f.write(end_template) diff --git a/cpp/include/raft/distance/specializations/detail/canberra.cuh b/cpp/include/raft/distance/specializations/detail/canberra.cuh index badce715a5..276c85e5f6 100644 --- a/cpp/include/raft/distance/specializations/detail/canberra.cuh +++ b/cpp/include/raft/distance/specializations/detail/canberra.cuh @@ -16,37 +16,25 @@ #pragma once -#include #include -namespace raft { -namespace distance { -namespace detail { -extern template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); +namespace raft::distance::detail { -extern template void distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); -} // namespace detail -} // namespace distance -} // namespace raft +extern template void pairwise_matrix_instantiation_point< + ops::canberra_distance_op, + int, + float, + float, + raft::identity_op>(ops::canberra_distance_op, + pairwise_matrix_params, + cudaStream_t); + +extern template void pairwise_matrix_instantiation_point< + ops::canberra_distance_op, + int, + double, + double, + raft::identity_op>(ops::canberra_distance_op, + pairwise_matrix_params, + cudaStream_t); +} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/correlation.cuh b/cpp/include/raft/distance/specializations/detail/correlation.cuh index 013a0d43a3..f019f678df 100644 --- a/cpp/include/raft/distance/specializations/detail/correlation.cuh +++ b/cpp/include/raft/distance/specializations/detail/correlation.cuh @@ -18,36 +18,23 @@ #include -namespace raft { -namespace distance { 
-namespace detail { -extern template void -distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); +namespace raft::distance::detail { -extern template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); -} // namespace detail -} // namespace distance -} // namespace raft +extern template void pairwise_matrix_instantiation_point< + ops::correlation_distance_op, + int, + float, + float, + raft::identity_op>(ops::correlation_distance_op, + pairwise_matrix_params, + cudaStream_t); + +extern template void pairwise_matrix_instantiation_point< + ops::correlation_distance_op, + int, + double, + double, + raft::identity_op>(ops::correlation_distance_op, + pairwise_matrix_params, + cudaStream_t); +} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/cosine.cuh b/cpp/include/raft/distance/specializations/detail/cosine.cuh index c88bd1b0f6..dcde4ec286 100644 --- a/cpp/include/raft/distance/specializations/detail/cosine.cuh +++ b/cpp/include/raft/distance/specializations/detail/cosine.cuh @@ -18,36 +18,23 @@ #include -namespace raft { -namespace distance { -namespace detail { -extern template void -distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); +namespace raft::distance::detail { -extern template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); -} // namespace detail -} // namespace distance -} // namespace raft +extern template 
void pairwise_matrix_instantiation_point, + int, + float, + float, + raft::identity_op>( + ops::cosine_distance_op, + pairwise_matrix_params, + cudaStream_t); + +extern template void pairwise_matrix_instantiation_point< + ops::cosine_distance_op, + int, + double, + double, + raft::identity_op>(ops::cosine_distance_op, + pairwise_matrix_params, + cudaStream_t); +} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/hamming_unexpanded.cuh b/cpp/include/raft/distance/specializations/detail/hamming_unexpanded.cuh index 3c5cad3315..1d6964fbce 100644 --- a/cpp/include/raft/distance/specializations/detail/hamming_unexpanded.cuh +++ b/cpp/include/raft/distance/specializations/detail/hamming_unexpanded.cuh @@ -18,36 +18,23 @@ #include -namespace raft { -namespace distance { -namespace detail { -extern template void -distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); +namespace raft::distance::detail { -extern template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); -} // namespace detail -} // namespace distance -} // namespace raft +extern template void pairwise_matrix_instantiation_point< + ops::hamming_distance_op, + int, + float, + float, + raft::identity_op>(ops::hamming_distance_op, + pairwise_matrix_params, + cudaStream_t); + +extern template void pairwise_matrix_instantiation_point< + ops::hamming_distance_op, + int, + double, + double, + raft::identity_op>(ops::hamming_distance_op, + pairwise_matrix_params, + cudaStream_t); +} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/hellinger_expanded.cuh 
b/cpp/include/raft/distance/specializations/detail/hellinger_expanded.cuh index bf214c046f..f96a06f919 100644 --- a/cpp/include/raft/distance/specializations/detail/hellinger_expanded.cuh +++ b/cpp/include/raft/distance/specializations/detail/hellinger_expanded.cuh @@ -18,37 +18,23 @@ #include -namespace raft { -namespace distance { -namespace detail { -extern template void -distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); +namespace raft::distance::detail { -extern template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); +extern template void pairwise_matrix_instantiation_point< + ops::hellinger_distance_op, + int, + float, + float, + raft::identity_op>(ops::hellinger_distance_op, + pairwise_matrix_params, + cudaStream_t); -} // namespace detail -} // namespace distance -} // namespace raft +extern template void pairwise_matrix_instantiation_point< + ops::hellinger_distance_op, + int, + double, + double, + raft::identity_op>(ops::hellinger_distance_op, + pairwise_matrix_params, + cudaStream_t); +} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/jensen_shannon.cuh b/cpp/include/raft/distance/specializations/detail/jensen_shannon.cuh index 145834fb70..0b58646582 100644 --- a/cpp/include/raft/distance/specializations/detail/jensen_shannon.cuh +++ b/cpp/include/raft/distance/specializations/detail/jensen_shannon.cuh @@ -18,37 +18,23 @@ #include -namespace raft { -namespace distance { -namespace detail { -extern template void -distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float 
metric_arg); +namespace raft::distance::detail { -extern template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); +extern template void pairwise_matrix_instantiation_point< + ops::jensen_shannon_distance_op, + int, + float, + float, + raft::identity_op>(ops::jensen_shannon_distance_op, + pairwise_matrix_params, + cudaStream_t); -} // namespace detail -} // namespace distance -} // namespace raft +extern template void pairwise_matrix_instantiation_point< + ops::jensen_shannon_distance_op, + int, + double, + double, + raft::identity_op>(ops::jensen_shannon_distance_op, + pairwise_matrix_params, + cudaStream_t); +} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/kl_divergence.cuh b/cpp/include/raft/distance/specializations/detail/kl_divergence.cuh index f0928916cd..5c164e0fd4 100644 --- a/cpp/include/raft/distance/specializations/detail/kl_divergence.cuh +++ b/cpp/include/raft/distance/specializations/detail/kl_divergence.cuh @@ -18,36 +18,23 @@ #include -namespace raft { -namespace distance { -namespace detail { -extern template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); +namespace raft::distance::detail { -extern template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); +extern template void pairwise_matrix_instantiation_point, + int, + float, + float, + raft::identity_op>( + ops::kl_divergence_op, + pairwise_matrix_params, + cudaStream_t); -} // namespace detail -} // namespace distance -} // namespace raft +extern template void 
pairwise_matrix_instantiation_point, + int, + double, + double, + raft::identity_op>( + ops::kl_divergence_op, + pairwise_matrix_params, + cudaStream_t); +} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/l1.cuh b/cpp/include/raft/distance/specializations/detail/l1.cuh index 23261a2571..870627d909 100644 --- a/cpp/include/raft/distance/specializations/detail/l1.cuh +++ b/cpp/include/raft/distance/specializations/detail/l1.cuh @@ -18,35 +18,23 @@ #include -namespace raft { -namespace distance { -namespace detail { -extern template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); +namespace raft::distance::detail { -extern template void distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); +extern template void pairwise_matrix_instantiation_point, + int, + float, + float, + raft::identity_op>( + ops::l1_distance_op, + pairwise_matrix_params, + cudaStream_t); -} // namespace detail -} // namespace distance -} // namespace raft +extern template void pairwise_matrix_instantiation_point, + int, + double, + double, + raft::identity_op>( + ops::l1_distance_op, + pairwise_matrix_params, + cudaStream_t); +} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/l2_expanded.cuh b/cpp/include/raft/distance/specializations/detail/l2_expanded.cuh index f953018b7d..ee3207bcce 100644 --- a/cpp/include/raft/distance/specializations/detail/l2_expanded.cuh +++ b/cpp/include/raft/distance/specializations/detail/l2_expanded.cuh @@ -18,36 +18,23 @@ #include -namespace raft { -namespace distance { -namespace detail { -extern template void distance( - raft::resources const& handle, - const float* x, 
- const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); +namespace raft::distance::detail { -extern template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); +extern template void pairwise_matrix_instantiation_point, + int, + float, + float, + raft::identity_op>( + ops::l2_exp_distance_op, + pairwise_matrix_params, + cudaStream_t); -} // namespace detail -} // namespace distance -} // namespace raft +extern template void pairwise_matrix_instantiation_point< + ops::l2_exp_distance_op, + int, + double, + double, + raft::identity_op>(ops::l2_exp_distance_op, + pairwise_matrix_params, + cudaStream_t); +} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/l2_sqrt_expanded.cuh b/cpp/include/raft/distance/specializations/detail/l2_sqrt_expanded.cuh deleted file mode 100644 index 9f5f6a3706..0000000000 --- a/cpp/include/raft/distance/specializations/detail/l2_sqrt_expanded.cuh +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -namespace raft { -namespace distance { -namespace detail { -extern template void -distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); - -extern template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/specializations/detail/l2_sqrt_unexpanded.cuh b/cpp/include/raft/distance/specializations/detail/l2_sqrt_unexpanded.cuh deleted file mode 100644 index 94531ddc33..0000000000 --- a/cpp/include/raft/distance/specializations/detail/l2_sqrt_unexpanded.cuh +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -namespace raft { -namespace distance { -namespace detail { -extern template void -distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); - -extern template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/specializations/detail/l2_unexpanded.cuh b/cpp/include/raft/distance/specializations/detail/l2_unexpanded.cuh index 224b21fce8..1fbf57632b 100644 --- a/cpp/include/raft/distance/specializations/detail/l2_unexpanded.cuh +++ b/cpp/include/raft/distance/specializations/detail/l2_unexpanded.cuh @@ -18,36 +18,23 @@ #include -namespace raft { -namespace distance { -namespace detail { -extern template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); +namespace raft::distance::detail { -extern template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); +extern template void pairwise_matrix_instantiation_point< + ops::l2_unexp_distance_op, + int, + float, + float, + raft::identity_op>(ops::l2_unexp_distance_op, + pairwise_matrix_params, + cudaStream_t); -} // namespace detail -} // namespace distance -} // namespace raft +extern template void pairwise_matrix_instantiation_point< + ops::l2_unexp_distance_op, + int, + double, + double, + raft::identity_op>(ops::l2_unexp_distance_op, + pairwise_matrix_params, + cudaStream_t); 
+} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/l_inf.cuh b/cpp/include/raft/distance/specializations/detail/l_inf.cuh index 9a46d7b488..388d3bf439 100644 --- a/cpp/include/raft/distance/specializations/detail/l_inf.cuh +++ b/cpp/include/raft/distance/specializations/detail/l_inf.cuh @@ -18,35 +18,23 @@ #include -namespace raft { -namespace distance { -namespace detail { -extern template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); +namespace raft::distance::detail { -extern template void distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); +extern template void pairwise_matrix_instantiation_point, + int, + float, + float, + raft::identity_op>( + ops::l_inf_distance_op, + pairwise_matrix_params, + cudaStream_t); -} // namespace detail -} // namespace distance -} // namespace raft +extern template void pairwise_matrix_instantiation_point< + ops::l_inf_distance_op, + int, + double, + double, + raft::identity_op>(ops::l_inf_distance_op, + pairwise_matrix_params, + cudaStream_t); +} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/lp_unexpanded.cuh b/cpp/include/raft/distance/specializations/detail/lp_unexpanded.cuh index e05ef02c42..d8e86ce6f2 100644 --- a/cpp/include/raft/distance/specializations/detail/lp_unexpanded.cuh +++ b/cpp/include/raft/distance/specializations/detail/lp_unexpanded.cuh @@ -18,36 +18,23 @@ #include -namespace raft { -namespace distance { -namespace detail { -extern template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool 
isRowMajor, - float metric_arg); +namespace raft::distance::detail { -extern template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); +extern template void pairwise_matrix_instantiation_point< + ops::lp_unexp_distance_op, + int, + float, + float, + raft::identity_op>(ops::lp_unexp_distance_op, + pairwise_matrix_params, + cudaStream_t); -} // namespace detail -} // namespace distance -} // namespace raft +extern template void pairwise_matrix_instantiation_point< + ops::lp_unexp_distance_op, + int, + double, + double, + raft::identity_op>(ops::lp_unexp_distance_op, + pairwise_matrix_params, + cudaStream_t); +} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/russel_rao.cuh b/cpp/include/raft/distance/specializations/detail/russel_rao.cuh index afc87997c0..4803fb8ab0 100644 --- a/cpp/include/raft/distance/specializations/detail/russel_rao.cuh +++ b/cpp/include/raft/distance/specializations/detail/russel_rao.cuh @@ -18,37 +18,23 @@ #include -namespace raft { -namespace distance { -namespace detail { -extern template void -distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); +namespace raft::distance::detail { -extern template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); +extern template void pairwise_matrix_instantiation_point< + ops::russel_rao_distance_op, + int, + float, + float, + raft::identity_op>(ops::russel_rao_distance_op, + pairwise_matrix_params, + cudaStream_t); -} // namespace detail -} // namespace distance -} // namespace raft +extern template 
void pairwise_matrix_instantiation_point< + ops::russel_rao_distance_op, + int, + double, + double, + raft::identity_op>(ops::russel_rao_distance_op, + pairwise_matrix_params, + cudaStream_t); +} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh index 8daa398b49..a34f696e9e 100644 --- a/cpp/include/raft/distance/specializations/distance.cuh +++ b/cpp/include/raft/distance/specializations/distance.cuh @@ -27,8 +27,6 @@ #include #include #include -#include -#include #include #include #include diff --git a/cpp/include/raft/linalg/detail/lanczos.cuh b/cpp/include/raft/linalg/detail/lanczos.cuh index 8c0cfeba28..73d93ab535 100644 --- a/cpp/include/raft/linalg/detail/lanczos.cuh +++ b/cpp/include/raft/linalg/detail/lanczos.cuh @@ -958,7 +958,7 @@ int computeSmallestEigenvectors( (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis RAFT_CUBLAS_TRY(cublasgemm(cublas_h, @@ -1305,7 +1305,7 @@ int computeLargestEigenvectors( cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis RAFT_CUBLAS_TRY(cublasgemm(cublas_h, diff --git a/cpp/include/raft/matrix/detail/select_k.cuh b/cpp/include/raft/matrix/detail/select_k.cuh index ac1ba3dfa3..20c2fb119d 100644 --- a/cpp/include/raft/matrix/detail/select_k.cuh +++ b/cpp/include/raft/matrix/detail/select_k.cuh @@ -84,7 +84,7 @@ void select_k(const T* in_val, in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); } else { select::radix::select_k= 4 ? 
11 : 8), 512>( - in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); + in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, true, stream, mr); } } diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh index 643a63d9db..7ac40ac0eb 100644 --- a/cpp/include/raft/matrix/detail/select_radix.cuh +++ b/cpp/include/raft/matrix/detail/select_radix.cuh @@ -16,11 +16,11 @@ #pragma once -#include - -#include #include #include +#include +#include +#include #include #include #include @@ -35,8 +35,8 @@ #include namespace raft::matrix::detail::select::radix { +namespace impl { -constexpr int ITEM_PER_THREAD = 32; constexpr int VECTORIZED_READ_SIZE = 16; template @@ -51,13 +51,6 @@ _RAFT_HOST_DEVICE constexpr int calc_num_passes() return ceildiv(sizeof(T) * 8, BitsPerPass); } -// Minimum reasonable block size for the given radix size. -template -_RAFT_HOST_DEVICE constexpr int calc_min_block_size() -{ - return 1 << std::max(BitsPerPass - 4, Pow2::Log2 + 1); -} - /** * Bit 0 is the least significant (rightmost); * this implementation processes input from the most to the least significant bit. @@ -82,23 +75,43 @@ _RAFT_DEVICE constexpr unsigned calc_mask(int pass) } /** - * Use cub to twiddle bits - so that we can correctly compare bits of floating-point values as well + * Use CUB to twiddle bits - so that we can correctly compare bits of floating-point values as well * as of integers. 
*/ template -_RAFT_DEVICE typename cub::Traits::UnsignedBits twiddle_in(T key, bool greater) +_RAFT_DEVICE typename cub::Traits::UnsignedBits twiddle_in(T key, bool select_min) { auto bits = reinterpret_cast::UnsignedBits&>(key); bits = cub::Traits::TwiddleIn(bits); - if (greater) { bits = ~bits; } + if (!select_min) { bits = ~bits; } return bits; } +template +_RAFT_DEVICE T twiddle_out(typename cub::Traits::UnsignedBits bits, bool select_min) +{ + if (!select_min) { bits = ~bits; } + bits = cub::Traits::TwiddleOut(bits); + return reinterpret_cast(bits); +} + template -_RAFT_DEVICE int calc_bucket(T x, int start_bit, unsigned mask, bool greater) +_RAFT_DEVICE int calc_bucket(T x, int start_bit, unsigned mask, bool select_min) +{ + static_assert(BitsPerPass <= sizeof(int) * 8 - 1, + "BitsPerPass is too large that the result type could not be int"); + return (twiddle_in(x, select_min) >> start_bit) & mask; +} + +template +_RAFT_HOST_DEVICE IdxT calc_buf_len(IdxT len) { - static_assert(BitsPerPass <= sizeof(int) * 8 - 1); // so return type can be int - return (twiddle_in(x, greater) >> start_bit) & mask; + // When writing is skipped, only read `in`(type T). + // When writing is not skipped, read `in_buf`(T) and `in_idx_buf`(IdxT), and write `out_buf`(T) + // and `out_idx_buf`(IdxT). + // The ratio between these cases determines whether to skip writing and hence the buffer size. 
+ constexpr float ratio = 2 + sizeof(IdxT) * 2.0 / sizeof(T); + return len / ratio; } /** @@ -111,17 +124,18 @@ _RAFT_DEVICE int calc_bucket(T x, int start_bit, unsigned mask, bool greater) * @tparam IdxT indexing type * @tparam Func void (T x, IdxT idx) * + * @param thread_rank rank of the calling thread among all participating threads + * @param num_threads number of the threads that participate in processing * @param in the input data * @param len the number of elements to read * @param f the lambda taking two arguments (T x, IdxT idx) */ template -_RAFT_DEVICE void vectorized_process(const T* in, IdxT len, Func f) +_RAFT_DEVICE void vectorized_process( + size_t thread_rank, size_t num_threads, const T* in, IdxT len, Func f) { - const IdxT stride = blockDim.x * gridDim.x; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; if constexpr (sizeof(T) >= VECTORIZED_READ_SIZE || VECTORIZED_READ_SIZE % sizeof(T) != 0) { - for (IdxT i = tid; i < len; i += stride) { + for (IdxT i = thread_rank; i < len; i += num_threads) { f(in[i], i); } } else { @@ -134,8 +148,8 @@ _RAFT_DEVICE void vectorized_process(const T* in, IdxT len, Func f) const IdxT skip_cnt_left = std::min((IdxT)(align_bytes::roundUp(in) - in), len); // The main loop: process all aligned data - for (IdxT i = tid * wide_t::Ratio + skip_cnt_left; i + wide_t::Ratio <= len; - i += stride * wide_t::Ratio) { + for (IdxT i = thread_rank * wide_t::Ratio + skip_cnt_left; i + wide_t::Ratio <= len; + i += num_threads * wide_t::Ratio) { wide.load(in, i); #pragma unroll for (int j = 0; j < wide_t::Ratio; ++j) { @@ -145,30 +159,55 @@ _RAFT_DEVICE void vectorized_process(const T* in, IdxT len, Func f) static_assert(WarpSize >= wide_t::Ratio); // Processes the skipped elements on the left - if (tid < skip_cnt_left) { f(in[tid], tid); } + if (thread_rank < skip_cnt_left) { f(in[thread_rank], thread_rank); } // Processes the skipped elements on the right const IdxT skip_cnt_right = align_elems::mod(len - skip_cnt_left); - 
const IdxT remain_i = len - skip_cnt_right + tid; + const IdxT remain_i = len - skip_cnt_right + thread_rank; if (remain_i < len) { f(in[remain_i], remain_i); } } } template -struct Counter { +struct alignas(128) Counter { + // We are processing the values in multiple passes, from most significant to least significant. In + // each pass, we keep the length of input (`len`) and the `k` of current pass, and update them at + // the end of the pass. IdxT k; IdxT len; + + // `previous_len` is the length of input in previous pass. Note that `previous_len` rather + // than `len` is used for the filtering step because filtering is indeed for previous pass (see + // comments before `radix_kernel`). IdxT previous_len; - int bucket; - IdxT filter_cnt; - unsigned int finished_block_cnt; - IdxT out_cnt; - IdxT out_back_cnt; + // We determine the bits of the k_th value inside the mask processed by the pass. The + // already known bits are stored in `kth_value_bits`. It's used to discriminate a element is a + // result (written to `out`), a candidate for next pass (written to `out_buf`), or not useful + // (discarded). The bits that are not yet processed do not matter for this purpose. + typename cub::Traits::UnsignedBits kth_value_bits; + + // Record how many elements have passed filtering. It's used to determine the position in the + // `out_buf` where an element should be written. + alignas(128) IdxT filter_cnt; + + // For a row inside a batch, we may launch multiple thread blocks. This counter is used to + // determine if the current block is the last running block. If so, this block will execute scan() + // and choose_bucket(). + alignas(128) unsigned int finished_block_cnt; + + // Record how many elements have been written to the front of `out`. Elements less (if + // select_min==true) than the k-th value are written from front to back. + alignas(128) IdxT out_cnt; + + // Record how many elements have been written to the back of `out`. 
Elements equal to the k-th + // value are written from back to front. We need to keep count of them separately because the + // number of elements that <= the k-th value might exceed k. + alignas(128) IdxT out_back_cnt; }; /** - * Fused filtering of the current phase and building histogram for the next phase - * (see steps 4-1 in `radix_kernel` description). + * Fused filtering of the current pass and building histogram for the next pass + * (see steps 4 & 1 in `radix_kernel` description). */ template _RAFT_DEVICE void filter_and_histogram(const T* in_buf, @@ -177,12 +216,12 @@ _RAFT_DEVICE void filter_and_histogram(const T* in_buf, IdxT* out_idx_buf, T* out, IdxT* out_idx, - IdxT len, + IdxT previous_len, Counter* counter, IdxT* histogram, - bool greater, + bool select_min, int pass, - int k) + bool early_stop) { constexpr int num_buckets = calc_num_buckets(); __shared__ IdxT histogram_smem[num_buckets]; @@ -198,19 +237,20 @@ _RAFT_DEVICE void filter_and_histogram(const T* in_buf, // Passed to vectorized_process, this function executes in all blocks in parallel, // i.e. the work is split along the input (both, in batches and chunks of a single row). // Later, the histograms are merged using atomicAdd. 
- auto f = [greater, start_bit, mask](T value, IdxT) { - int bucket = calc_bucket(value, start_bit, mask, greater); - atomicAdd(histogram_smem + bucket, IdxT(1)); + auto f = [select_min, start_bit, mask](T value, IdxT) { + int bucket = calc_bucket(value, start_bit, mask, select_min); + atomicAdd(histogram_smem + bucket, static_cast(1)); }; - vectorized_process(in_buf, len, f); + vectorized_process(static_cast(blockIdx.x) * blockDim.x + threadIdx.x, + static_cast(blockDim.x) * gridDim.x, + in_buf, + previous_len, + f); } else { - const IdxT previous_len = counter->previous_len; - const int want_bucket = counter->bucket; - IdxT& filter_cnt = counter->filter_cnt; - IdxT& out_cnt = counter->out_cnt; - const IdxT counter_len = counter->len; + IdxT* p_filter_cnt = &counter->filter_cnt; + IdxT* p_out_cnt = &counter->out_cnt; + const auto kth_value_bits = counter->kth_value_bits; const int previous_start_bit = calc_start_bit(pass - 1); - const unsigned previous_mask = calc_mask(pass - 1); // See the remark above on the distributed execution of `f` using vectorized_process. auto f = [in_idx_buf, @@ -218,38 +258,50 @@ _RAFT_DEVICE void filter_and_histogram(const T* in_buf, out_idx_buf, out, out_idx, - greater, - k, + select_min, start_bit, mask, previous_start_bit, - previous_mask, - want_bucket, - &filter_cnt, - &out_cnt, - counter_len](T value, IdxT i) { - int prev_bucket = - calc_bucket(value, previous_start_bit, previous_mask, greater); - if (prev_bucket == want_bucket) { - IdxT pos = atomicAdd(&filter_cnt, IdxT(1)); - out_buf[pos] = value; - if (out_idx_buf) { out_idx_buf[pos] = in_idx_buf ? in_idx_buf[i] : i; } - int bucket = calc_bucket(value, start_bit, mask, greater); - atomicAdd(histogram_smem + bucket, IdxT(1)); - - if (counter_len == 1) { - out[k - 1] = value; - out_idx[k - 1] = in_idx_buf ? 
in_idx_buf[i] : i; + kth_value_bits, + p_filter_cnt, + p_out_cnt, + early_stop](T value, IdxT i) { + const auto previous_bits = (twiddle_in(value, select_min) >> previous_start_bit) + << previous_start_bit; + if (previous_bits == kth_value_bits) { + if (early_stop) { + IdxT pos = atomicAdd(p_out_cnt, static_cast(1)); + out[pos] = value; + out_idx[pos] = in_idx_buf ? in_idx_buf[i] : i; + } else { + if (out_buf) { + IdxT pos = atomicAdd(p_filter_cnt, static_cast(1)); + out_buf[pos] = value; + out_idx_buf[pos] = in_idx_buf ? in_idx_buf[i] : i; + } + + int bucket = calc_bucket(value, start_bit, mask, select_min); + atomicAdd(histogram_smem + bucket, static_cast(1)); } - } else if (prev_bucket < want_bucket) { - IdxT pos = atomicAdd(&out_cnt, IdxT(1)); + } + // the condition `(out_buf || early_stop)` is a little tricky: + // If we skip writing to `out_buf` (when `out_buf` is nullptr), we should skip writing to + // `out` too. So we won't write the same value to `out` multiple times in different passes. + // And if we keep skipping the writing, values will be written in `last_filter_kernel()` at + // last. But when `early_stop` is true, we need to write to `out` since it's the last chance. + else if ((out_buf || early_stop) && previous_bits < kth_value_bits) { + IdxT pos = atomicAdd(p_out_cnt, static_cast(1)); out[pos] = value; out_idx[pos] = in_idx_buf ? in_idx_buf[i] : i; } }; - - vectorized_process(in_buf, previous_len, f); + vectorized_process(static_cast(blockIdx.x) * blockDim.x + threadIdx.x, + static_cast(blockDim.x) * gridDim.x, + in_buf, + previous_len, + f); } + if (early_stop) { return; } __syncthreads(); // merge histograms produced by individual blocks @@ -259,69 +311,184 @@ _RAFT_DEVICE void filter_and_histogram(const T* in_buf, } /** - * Replace a part of the histogram with its own prefix sum, starting from the `start` and adding - * `current` to each entry of the result. 
+ * Replace histogram with its own prefix sum * (step 2 in `radix_kernel` description) */ template -_RAFT_DEVICE void scan(volatile IdxT* histogram, - const int start, - const int num_buckets, - const IdxT current) +_RAFT_DEVICE void scan(volatile IdxT* histogram) { - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; + constexpr int num_buckets = calc_num_buckets(); + if constexpr (num_buckets >= BlockSize) { + static_assert(num_buckets % BlockSize == 0); + constexpr int items_per_thread = num_buckets / BlockSize; + typedef cub::BlockLoad BlockLoad; + typedef cub::BlockStore + BlockStore; + typedef cub::BlockScan BlockScan; - IdxT thread_data = 0; - int index = start + threadIdx.x; - if (index < num_buckets) { thread_data = histogram[index]; } + __shared__ union { + typename BlockLoad::TempStorage load; + typename BlockScan::TempStorage scan; + typename BlockStore::TempStorage store; + } temp_storage; + IdxT thread_data[items_per_thread]; - BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); - __syncthreads(); - if (index < num_buckets) { histogram[index] = thread_data + current; } - __syncthreads(); // This sync is necessary, as the content of histogram needs - // to be read after + BlockLoad(temp_storage.load).Load(histogram, thread_data); + __syncthreads(); + + BlockScan(temp_storage.scan).InclusiveSum(thread_data, thread_data); + __syncthreads(); + + BlockStore(temp_storage.store).Store(histogram, thread_data); + } else { + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + IdxT thread_data = 0; + if (threadIdx.x < num_buckets) { thread_data = histogram[threadIdx.x]; } + + BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + __syncthreads(); + + if (threadIdx.x < num_buckets) { histogram[threadIdx.x] = thread_data; } + } } /** * Calculate in which bucket the k-th value will fall - * (steps 2-3 in `radix_kernel` description) + * (steps 3 in 
`radix_kernel` description) */ -template -_RAFT_DEVICE void choose_bucket(Counter* counter, IdxT* histogram, const IdxT k) +template +_RAFT_DEVICE void choose_bucket(Counter* counter, + const IdxT* histogram, + const IdxT k, + const int pass) { constexpr int num_buckets = calc_num_buckets(); - int index = threadIdx.x; - IdxT last_prefix_sum = 0; - int num_pass = 1; - if constexpr (num_buckets >= BlockSize) { - static_assert(num_buckets % BlockSize == 0); - num_pass = num_buckets / BlockSize; + for (int i = threadIdx.x; i < num_buckets; i += blockDim.x) { + IdxT prev = (i == 0) ? 0 : histogram[i - 1]; + IdxT cur = histogram[i]; + + // one and only one thread will satisfy this condition, so counter is written by only one thread + if (prev < k && cur >= k) { + counter->k = k - prev; // how many values still are there to find + counter->len = cur - prev; // number of values in next pass + typename cub::Traits::UnsignedBits bucket = i; + int start_bit = calc_start_bit(pass); + counter->kth_value_bits |= bucket << start_bit; + } } +} - for (int i = 0; i < num_pass && (last_prefix_sum < k); i++) { - // Turn the i-th chunk of the histogram into its prefix sum. - scan(histogram, i * BlockSize, num_buckets, last_prefix_sum); - if (index < num_buckets) { - // Number of values in the previous `index-1` buckets (see the `scan` op above) - IdxT prev = (index == 0) ? 0 : histogram[index - 1]; - // Number of values in `index` buckets - IdxT cur = histogram[index]; - - // one and only one thread will satisfy this condition, so only write once - if (prev < k && cur >= k) { - counter->k = k - prev; // how many values still are there to find - counter->previous_len = counter->len; - counter->len = cur - prev; // number of values in `index` bucket - counter->bucket = index; +// For one-block version, last_filter() could be called when pass < num_passes - 1. 
+// So `pass` could not be constexpr +template +_RAFT_DEVICE void last_filter(const T* in_buf, + const IdxT* in_idx_buf, + T* out, + IdxT* out_idx, + IdxT current_len, + IdxT k, + Counter* counter, + const bool select_min, + const int pass) +{ + const auto kth_value_bits = counter->kth_value_bits; + const int start_bit = calc_start_bit(pass); + + // changed in choose_bucket(); need to reload + const IdxT needed_num_of_kth = counter->k; + IdxT* p_out_cnt = &counter->out_cnt; + IdxT* p_out_back_cnt = &counter->out_back_cnt; + for (IdxT i = threadIdx.x; i < current_len; i += blockDim.x) { + const T value = in_buf[i]; + const auto bits = (twiddle_in(value, select_min) >> start_bit) << start_bit; + if (bits < kth_value_bits) { + IdxT pos = atomicAdd(p_out_cnt, static_cast(1)); + out[pos] = value; + // For one-block version, `in_idx_buf` could be nullptr at pass 0. + // For non one-block version, if writing has been skipped, `in_idx_buf` could be nullptr if + // `in_buf` is `in` + out_idx[pos] = in_idx_buf ? in_idx_buf[i] : i; + } else if (bits == kth_value_bits) { + IdxT back_pos = atomicAdd(p_out_back_cnt, static_cast(1)); + if (back_pos < needed_num_of_kth) { + IdxT pos = k - 1 - back_pos; + out[pos] = value; + out_idx[pos] = in_idx_buf ? 
in_idx_buf[i] : i; } } - index += BlockSize; - // this will break the loop when the counter is set (cur >= k), because last_prefix_sum >= cur - last_prefix_sum = histogram[(i + 1) * BlockSize - 1]; } } +template +__global__ void last_filter_kernel(const T* in, + const IdxT* in_idx, + const T* in_buf, + const IdxT* in_idx_buf, + T* out, + IdxT* out_idx, + IdxT len, + IdxT k, + Counter* counters, + const bool select_min) +{ + const size_t batch_id = blockIdx.y; // size_t to avoid multiplication overflow + + Counter* counter = counters + batch_id; + IdxT previous_len = counter->previous_len; + if (previous_len == 0) { return; } + const IdxT buf_len = calc_buf_len(len); + if (previous_len > buf_len || in_buf == in) { + in_buf = in + batch_id * len; + in_idx_buf = in_idx ? (in_idx + batch_id * len) : nullptr; + previous_len = len; + } else { + in_buf += batch_id * buf_len; + in_idx_buf += batch_id * buf_len; + } + out += batch_id * k; + out_idx += batch_id * k; + + constexpr int pass = calc_num_passes() - 1; + constexpr int start_bit = calc_start_bit(pass); + + const auto kth_value_bits = counter->kth_value_bits; + const IdxT needed_num_of_kth = counter->k; + IdxT* p_out_cnt = &counter->out_cnt; + IdxT* p_out_back_cnt = &counter->out_back_cnt; + + auto f = [k, + select_min, + kth_value_bits, + needed_num_of_kth, + p_out_cnt, + p_out_back_cnt, + in_idx_buf, + out, + out_idx](T value, IdxT i) { + const auto bits = (twiddle_in(value, select_min) >> start_bit) << start_bit; + if (bits < kth_value_bits) { + IdxT pos = atomicAdd(p_out_cnt, static_cast(1)); + out[pos] = value; + out_idx[pos] = in_idx_buf ? in_idx_buf[i] : i; + } else if (bits == kth_value_bits) { + IdxT back_pos = atomicAdd(p_out_back_cnt, static_cast(1)); + if (back_pos < needed_num_of_kth) { + IdxT pos = k - 1 - back_pos; + out[pos] = value; + out_idx[pos] = in_idx_buf ? 
in_idx_buf[i] : i; + } + } + }; + + vectorized_process(static_cast(blockIdx.x) * blockDim.x + threadIdx.x, + static_cast(blockDim.x) * gridDim.x, + in_buf, + previous_len, + f); +} + /** * * It is expected to call this kernel multiple times (passes), in each pass we process a radix, @@ -350,35 +517,79 @@ _RAFT_DEVICE void choose_bucket(Counter* counter, IdxT* histogram, cons * * In the implementation, the filtering step is delayed to the next pass so the filtering and * histogram computation are fused. In this way, inputs are read once rather than twice. + * + * During the filtering step, we won't write candidates (elements in bucket j) to `out_buf` if the + * number of candidates is larger than the length of `out_buf` (this could happen when the leading + * bits of input values are almost the same). And then in the next pass, inputs are read from `in` + * rather than from `in_buf`. The benefit is that we can save the cost of writing candidates and + * their indices. */ -template -__global__ void __launch_bounds__(BlockSize) radix_kernel(const T* in_buf, - const IdxT* in_idx_buf, - T* out_buf, - IdxT* out_idx_buf, - T* out, - IdxT* out_idx, - Counter* counters, - IdxT* histograms, - const IdxT len, - const int k, - const bool greater, - const int pass) +template +__global__ void radix_kernel(const T* in, + const IdxT* in_idx, + const T* in_buf, + const IdxT* in_idx_buf, + T* out_buf, + IdxT* out_idx_buf, + T* out, + IdxT* out_idx, + Counter* counters, + IdxT* histograms, + const IdxT len, + const IdxT k, + const bool select_min, + const int pass) { - __shared__ bool isLastBlockDone; + const size_t batch_id = blockIdx.y; + auto counter = counters + batch_id; + IdxT current_k; + IdxT previous_len; + IdxT current_len; + if (pass == 0) { + current_k = k; + previous_len = len; + // Need to do this so setting counter->previous_len for the next pass is correct. 
+ // This value is meaningless for pass 0, but it's fine because pass 0 won't be the + // last pass in this implementation so pass 0 won't hit the "if (pass == + // num_passes - 1)" branch. + // Maybe it's better to reload counter->previous_len and use it rather than + // current_len in last_filter() + current_len = len; + } else { + current_k = counter->k; + current_len = counter->len; + previous_len = counter->previous_len; + } + if (current_len == 0) { return; } - constexpr int num_buckets = calc_num_buckets(); - constexpr int num_passes = calc_num_passes(); - const int batch_id = blockIdx.y; - in_buf += batch_id * len; - out_buf += batch_id * len; + // When k=len, early_stop will be true at pass 0. It means filter_and_histogram() should handle + // correctly the case that pass=0 and early_stop=true. However, this special case of k=len is + // handled in other way in select_k() so such case is not possible here. + const bool early_stop = (current_len == current_k); + const IdxT buf_len = calc_buf_len(len); + + // "previous_len > buf_len" means previous pass skips writing buffer + if (pass == 0 || pass == 1 || previous_len > buf_len) { + in_buf = in + batch_id * len; + in_idx_buf = in_idx ? 
(in_idx + batch_id * len) : nullptr; + previous_len = len; + } else { + in_buf += batch_id * buf_len; + in_idx_buf += batch_id * buf_len; + } + // "current_len > buf_len" means current pass will skip writing buffer + if (pass == 0 || current_len > buf_len) { + out_buf = nullptr; + out_idx_buf = nullptr; + } else { + out_buf += batch_id * buf_len; + out_idx_buf += batch_id * buf_len; + } out += batch_id * k; out_idx += batch_id * k; - if (in_idx_buf) { in_idx_buf += batch_id * len; } - if (out_idx_buf) { out_idx_buf += batch_id * len; } - auto counter = counters + batch_id; - auto histogram = histograms + batch_id * num_buckets; + constexpr int num_buckets = calc_num_buckets(); + auto histogram = histograms + batch_id * num_buckets; filter_and_histogram(in_buf, in_idx_buf, @@ -386,126 +597,468 @@ __global__ void __launch_bounds__(BlockSize) radix_kernel(const T* in_buf, out_idx_buf, out, out_idx, - len, + previous_len, counter, histogram, - greater, + select_min, pass, - k); + early_stop); __threadfence(); + bool isLastBlock = false; if (threadIdx.x == 0) { unsigned int finished = atomicInc(&counter->finished_block_cnt, gridDim.x - 1); - isLastBlockDone = (finished == (gridDim.x - 1)); + isLastBlock = (finished == (gridDim.x - 1)); } - // Synchronize to make sure that each thread reads the correct value of - // isLastBlockDone. 
- __syncthreads(); - if (isLastBlockDone) { - if (counter->len == 1 && threadIdx.x == 0) { - counter->previous_len = 0; - counter->len = 0; - } - // init counter, other members of counter is initialized with 0 by - // cudaMemset() - if (pass == 0 && threadIdx.x == 0) { - counter->k = k; - counter->len = len; - counter->out_back_cnt = 0; + if (__syncthreads_or(isLastBlock)) { + if (early_stop) { + if (threadIdx.x == 0) { + // `last_filter_kernel()` requires setting previous_len + counter->previous_len = 0; + counter->len = 0; + } + return; } + + scan(histogram); __syncthreads(); + choose_bucket(counter, histogram, current_k, pass); + __syncthreads(); + + constexpr int num_passes = calc_num_passes(); + // reset for next pass + if (pass != num_passes - 1) { + for (int i = threadIdx.x; i < num_buckets; i += blockDim.x) { + histogram[i] = 0; + } + } + if (threadIdx.x == 0) { + // `last_filter_kernel()` requires setting previous_len even in the last pass + counter->previous_len = current_len; + // not necessary for the last pass, but put it here anyway + counter->filter_cnt = 0; + } + + if constexpr (fused_last_filter) { + if (pass == num_passes - 1) { + last_filter(out_buf ? out_buf : in_buf, + out_idx_buf ? out_idx_buf : in_idx_buf, + out, + out_idx, + out_buf ? 
current_len : len, + k, + counter, + select_min, + pass); + } + } + } +} + +template +int calc_chunk_size(int batch_size, IdxT len, int sm_cnt, Kernel kernel) +{ + int active_blocks; + RAFT_CUDA_TRY( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&active_blocks, kernel, BlockSize, 0)); + + constexpr int items_per_thread = 32; + constexpr int num_waves = 10; + int chunk_size = + std::max(1, num_waves * sm_cnt * active_blocks * BlockSize * items_per_thread / len); + return std::min(chunk_size, batch_size); +} - IdxT ori_k = counter->k; +template +unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt) +{ + static_assert(VECTORIZED_READ_SIZE / sizeof(T) >= 1); + + int active_blocks; + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &active_blocks, radix_kernel, BlockSize, 0)); + active_blocks *= sm_cnt; - if (counter->len > 0) { - choose_bucket(counter, histogram, ori_k); + IdxT best_num_blocks = 0; + float best_tail_wave_penalty = 1.0f; + const IdxT max_num_blocks = ceildiv(len, VECTORIZED_READ_SIZE / sizeof(T) * BlockSize); + for (int num_waves = 1;; ++num_waves) { + IdxT num_blocks = std::min( + max_num_blocks, static_cast(std::max(num_waves * active_blocks / batch_size, 1))); + IdxT items_per_thread = ceildiv(len, num_blocks * BlockSize); + items_per_thread = alignTo(items_per_thread, VECTORIZED_READ_SIZE / sizeof(T)); + num_blocks = ceildiv(len, items_per_thread * BlockSize); + float actual_num_waves = static_cast(num_blocks) * batch_size / active_blocks; + float tail_wave_penalty = + (ceilf(actual_num_waves) - actual_num_waves) / ceilf(actual_num_waves); + + // 0.15 is determined experimentally. It also ensures breaking the loop early, + // e.g. 
when num_waves > 7, tail_wave_penalty will always <0.15 + if (tail_wave_penalty < 0.15) { + best_num_blocks = num_blocks; + break; + } else if (tail_wave_penalty < best_tail_wave_penalty) { + best_num_blocks = num_blocks; + best_tail_wave_penalty = tail_wave_penalty; } - __syncthreads(); - if (pass == num_passes - 1) { - const IdxT previous_len = counter->previous_len; - const int want_bucket = counter->bucket; - int start_bit = calc_start_bit(pass); - unsigned mask = calc_mask(pass); - - // radix topk - IdxT& out_cnt = counter->out_cnt; - for (IdxT i = threadIdx.x; i < previous_len; i += blockDim.x) { - const T value = out_buf[i]; - int bucket = calc_bucket(value, start_bit, mask, greater); - if (bucket < want_bucket) { - IdxT pos = atomicAdd(&out_cnt, IdxT(1)); - out[pos] = value; - out_idx[pos] = out_idx_buf[i]; - } else if (bucket == want_bucket) { - IdxT needed_num_of_kth = counter->k; - IdxT back_pos = atomicAdd(&(counter->out_back_cnt), IdxT(1)); - if (back_pos < needed_num_of_kth) { - IdxT pos = k - 1 - back_pos; - out[pos] = value; - out_idx[pos] = out_idx_buf[i]; - } - } + if (num_blocks == max_num_blocks) { break; } + } + return best_num_blocks; +} + +template +_RAFT_HOST_DEVICE void set_buf_pointers(const T* in, + const IdxT* in_idx, + T* buf1, + IdxT* idx_buf1, + T* buf2, + IdxT* idx_buf2, + int pass, + const T*& in_buf, + const IdxT*& in_idx_buf, + T*& out_buf, + IdxT*& out_idx_buf) +{ + if (pass == 0) { + in_buf = in; + in_idx_buf = nullptr; + out_buf = nullptr; + out_idx_buf = nullptr; + } else if (pass == 1) { + in_buf = in; + in_idx_buf = in_idx; + out_buf = buf1; + out_idx_buf = idx_buf1; + } else if (pass % 2 == 0) { + in_buf = buf1; + in_idx_buf = idx_buf1; + out_buf = buf2; + out_idx_buf = idx_buf2; + } else { + in_buf = buf2; + in_idx_buf = idx_buf2; + out_buf = buf1; + out_idx_buf = idx_buf1; + } +} + +template +void radix_topk(const T* in, + const IdxT* in_idx, + int batch_size, + IdxT len, + IdxT k, + T* out, + IdxT* out_idx, + bool 
select_min, + bool fused_last_filter, + unsigned grid_dim, + int sm_cnt, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // TODO: is it possible to relax this restriction? + static_assert(calc_num_passes() > 1); + constexpr int num_buckets = calc_num_buckets(); + + auto kernel = radix_kernel; + const size_t max_chunk_size = + calc_chunk_size(batch_size, len, sm_cnt, kernel); + if (max_chunk_size != static_cast(batch_size)) { + grid_dim = calc_grid_dim(max_chunk_size, len, sm_cnt); + } + const IdxT buf_len = calc_buf_len(len); + + size_t req_aux = max_chunk_size * (sizeof(Counter) + num_buckets * sizeof(IdxT)); + size_t req_buf = max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)); + size_t mem_req = req_aux + req_buf + 256 * 6; // might need extra memory for alignment + + auto pool_guard = raft::get_pool_memory_resource(mr, mem_req); + if (pool_guard) { + RAFT_LOG_DEBUG("radix::select_k: using pool memory resource with initial size %zu bytes", + pool_guard->pool_size()); + } + + rmm::device_uvector> counters(max_chunk_size, stream, mr); + rmm::device_uvector histograms(max_chunk_size * num_buckets, stream, mr); + rmm::device_uvector buf1(max_chunk_size * buf_len, stream, mr); + rmm::device_uvector idx_buf1(max_chunk_size * buf_len, stream, mr); + rmm::device_uvector buf2(max_chunk_size * buf_len, stream, mr); + rmm::device_uvector idx_buf2(max_chunk_size * buf_len, stream, mr); + + for (size_t offset = 0; offset < static_cast(batch_size); offset += max_chunk_size) { + int chunk_size = std::min(max_chunk_size, batch_size - offset); + RAFT_CUDA_TRY( + cudaMemsetAsync(counters.data(), 0, counters.size() * sizeof(Counter), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(histograms.data(), 0, histograms.size() * sizeof(IdxT), stream)); + + const T* chunk_in = in + offset * len; + const IdxT* chunk_in_idx = in_idx ? 
(in_idx + offset * len) : nullptr; + T* chunk_out = out + offset * k; + IdxT* chunk_out_idx = out_idx + offset * k; + + const T* in_buf = nullptr; + const IdxT* in_idx_buf = nullptr; + T* out_buf = nullptr; + IdxT* out_idx_buf = nullptr; + + dim3 blocks(grid_dim, chunk_size); + constexpr int num_passes = calc_num_passes(); + + for (int pass = 0; pass < num_passes; ++pass) { + set_buf_pointers(chunk_in, + chunk_in_idx, + buf1.data(), + idx_buf1.data(), + buf2.data(), + idx_buf2.data(), + pass, + in_buf, + in_idx_buf, + out_buf, + out_idx_buf); + + if (fused_last_filter && pass == num_passes - 1) { + kernel = radix_kernel; } - __syncthreads(); - } else { - // reset for next pass - for (int i = threadIdx.x; i < num_buckets; i += blockDim.x) { - histogram[i] = 0; + + kernel<<>>(chunk_in, + chunk_in_idx, + in_buf, + in_idx_buf, + out_buf, + out_idx_buf, + chunk_out, + chunk_out_idx, + counters.data(), + histograms.data(), + len, + k, + select_min, + pass); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + if (!fused_last_filter) { + last_filter_kernel<<>>(chunk_in, + chunk_in_idx, + out_buf, + out_idx_buf, + chunk_out, + chunk_out_idx, + len, + k, + counters.data(), + select_min); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + } +} + +// The following a few functions are for the one-block version, which uses single thread block for +// each row of a batch. 
+template +_RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf, + const IdxT* in_idx_buf, + T* out_buf, + IdxT* out_idx_buf, + T* out, + IdxT* out_idx, + Counter* counter, + IdxT* histogram, + bool select_min, + int pass) +{ + constexpr int num_buckets = calc_num_buckets(); + for (int i = threadIdx.x; i < num_buckets; i += blockDim.x) { + histogram[i] = 0; + } + IdxT* p_filter_cnt = &counter->filter_cnt; + if (threadIdx.x == 0) { *p_filter_cnt = 0; } + __syncthreads(); + + const int start_bit = calc_start_bit(pass); + const unsigned mask = calc_mask(pass); + const IdxT previous_len = counter->previous_len; + + if (pass == 0) { + auto f = [histogram, select_min, start_bit, mask](T value, IdxT) { + int bucket = calc_bucket(value, start_bit, mask, select_min); + atomicAdd(histogram + bucket, static_cast(1)); + }; + vectorized_process(threadIdx.x, blockDim.x, in_buf, previous_len, f); + } else { + // not use vectorized_process here because it increases #registers a lot + IdxT* p_out_cnt = &counter->out_cnt; + const auto kth_value_bits = counter->kth_value_bits; + const int previous_start_bit = calc_start_bit(pass - 1); + + for (IdxT i = threadIdx.x; i < previous_len; i += blockDim.x) { + const T value = in_buf[i]; + const auto previous_bits = (twiddle_in(value, select_min) >> previous_start_bit) + << previous_start_bit; + if (previous_bits == kth_value_bits) { +#if CUDART_VERSION < 12000 + // Avoiding potential compiler bug in CUDA 11 + volatile +#endif + IdxT pos = atomicAdd(p_filter_cnt, static_cast(1)); + out_buf[pos] = value; + out_idx_buf[pos] = in_idx_buf ? in_idx_buf[i] : i; + + int bucket = calc_bucket(value, start_bit, mask, select_min); + atomicAdd(histogram + bucket, static_cast(1)); + } else if (previous_bits < kth_value_bits) { + IdxT pos = atomicAdd(p_out_cnt, static_cast(1)); + out[pos] = value; + out_idx[pos] = in_idx_buf ? 
in_idx_buf[i] : i; } - if (threadIdx.x == 0) { counter->filter_cnt = 0; } } } } -/** - * Calculate the minimal batch size, such that GPU is still fully occupied. - */ template -inline dim3 get_optimal_grid_size(size_t req_batch_size, size_t len) +__global__ void radix_topk_one_block_kernel(const T* in, + const IdxT* in_idx, + const IdxT len, + const IdxT k, + T* out, + IdxT* out_idx, + const bool select_min, + T* buf1, + IdxT* idx_buf1, + T* buf2, + IdxT* idx_buf2) { - int dev_id, sm_count, occupancy, max_grid_dim_y; - RAFT_CUDA_TRY(cudaGetDevice(&dev_id)); - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id)); - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&max_grid_dim_y, cudaDevAttrMaxGridDimY, dev_id)); - RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &occupancy, radix_kernel, BlockSize, 0)); - - // number of block we'd use if the batch size is enough to occupy the gpu in any case - size_t blocks_per_row = ceildiv(len, BlockSize * ITEM_PER_THREAD); - - // fully occupy GPU - size_t opt_batch_size = ceildiv(sm_count * occupancy, blocks_per_row); - // round it up to the closest pow-of-two for better data alignment - opt_batch_size = isPo2(opt_batch_size) ? opt_batch_size : (1 << (log2(opt_batch_size) + 1)); - // Take a max possible pow-of-two grid_dim_y - max_grid_dim_y = isPo2(max_grid_dim_y) ? max_grid_dim_y : (1 << log2(max_grid_dim_y)); - // If the optimal batch size is very small compared to the requested batch size, we know - // the extra required memory is not significant and we can increase the batch size for - // better occupancy when the grid size is not multiple of the SM count. - // Also don't split the batch size when there is not much work overall. 
- const size_t safe_enlarge_factor = 9; - const size_t min_grid_size = 1024; - while ((opt_batch_size << safe_enlarge_factor) < req_batch_size || - blocks_per_row * opt_batch_size < min_grid_size) { - opt_batch_size <<= 1; + constexpr int num_buckets = calc_num_buckets(); + __shared__ Counter counter; + __shared__ IdxT histogram[num_buckets]; + + if (threadIdx.x == 0) { + counter.k = k; + counter.len = len; + counter.previous_len = len; + counter.kth_value_bits = 0; + counter.out_cnt = 0; + counter.out_back_cnt = 0; } + __syncthreads(); + + const size_t batch_id = blockIdx.x; // size_t to avoid multiplication overflow + in += batch_id * len; + if (in_idx) { in_idx += batch_id * len; } + out += batch_id * k; + out_idx += batch_id * k; + buf1 += batch_id * len; + idx_buf1 += batch_id * len; + buf2 += batch_id * len; + idx_buf2 += batch_id * len; + const T* in_buf = nullptr; + const IdxT* in_idx_buf = nullptr; + T* out_buf = nullptr; + IdxT* out_idx_buf = nullptr; + + constexpr int num_passes = calc_num_passes(); + for (int pass = 0; pass < num_passes; ++pass) { + set_buf_pointers( + in, in_idx, buf1, idx_buf1, buf2, idx_buf2, pass, in_buf, in_idx_buf, out_buf, out_idx_buf); + + IdxT current_len = counter.len; + IdxT current_k = counter.k; + + filter_and_histogram_for_one_block(in_buf, + in_idx_buf, + out_buf, + out_idx_buf, + out, + out_idx, + &counter, + histogram, + select_min, + pass); + __syncthreads(); + + scan(histogram); + __syncthreads(); + + choose_bucket(&counter, histogram, current_k, pass); + if (threadIdx.x == 0) { counter.previous_len = current_len; } + __syncthreads(); - // Do not exceed the max grid size. - opt_batch_size = std::min(opt_batch_size, size_t(max_grid_dim_y)); - // Don't do more work than needed - opt_batch_size = std::min(opt_batch_size, req_batch_size); - // Let more blocks share one row if the required batch size is too small. 
- while (opt_batch_size * blocks_per_row < size_t(sm_count * occupancy) && - // Ensure we still can read data somewhat efficiently - len * sizeof(T) > 2 * VECTORIZED_READ_SIZE * BlockSize * blocks_per_row) { - blocks_per_row <<= 1; + if (counter.len == counter.k || pass == num_passes - 1) { + last_filter(pass == 0 ? in : out_buf, + pass == 0 ? in_idx : out_idx_buf, + out, + out_idx, + current_len, + k, + &counter, + select_min, + pass); + break; + } } +} - return dim3(blocks_per_row, opt_batch_size); +// radix_topk() might use multiple thread blocks for one row of a batch. In contrast, the following +// one-block version uses single thread block for one row of a batch, so intermediate data, like +// counters and global histograms, can be kept in shared memory and cheap sync operations can be +// used. It's used when len is relatively small or when the number of blocks per row calculated by +// `calc_grid_dim()` is 1. +template +void radix_topk_one_block(const T* in, + const IdxT* in_idx, + int batch_size, + IdxT len, + IdxT k, + T* out, + IdxT* out_idx, + bool select_min, + int sm_cnt, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + static_assert(calc_num_passes() > 1); + + auto kernel = radix_topk_one_block_kernel; + const size_t max_chunk_size = + calc_chunk_size(batch_size, len, sm_cnt, kernel); + + auto pool_guard = + raft::get_pool_memory_resource(mr, + max_chunk_size * len * 2 * (sizeof(T) + sizeof(IdxT)) + + 256 * 4 // might need extra memory for alignment + ); + if (pool_guard) { + RAFT_LOG_DEBUG("radix::select_k: using pool memory resource with initial size %zu bytes", + pool_guard->pool_size()); + } + + rmm::device_uvector buf1(len * max_chunk_size, stream, mr); + rmm::device_uvector idx_buf1(len * max_chunk_size, stream, mr); + rmm::device_uvector buf2(len * max_chunk_size, stream, mr); + rmm::device_uvector idx_buf2(len * max_chunk_size, stream, mr); + + for (size_t offset = 0; offset < static_cast(batch_size); offset += 
max_chunk_size) { + int chunk_size = std::min(max_chunk_size, batch_size - offset); + kernel<<>>(in + offset * len, + in_idx ? (in_idx + offset * len) : nullptr, + len, + k, + out + offset * k, + out_idx + offset * k, + select_min, + buf1.data(), + idx_buf1.data(), + buf2.data(), + idx_buf2.data()); + } } +} // namespace impl + /** * Select k smallest or largest key/values from each row in the input data. * @@ -546,6 +1099,12 @@ inline dim3 get_optimal_grid_size(size_t req_batch_size, size_t len) * the payload selected together with `out`. * @param select_min * whether to select k smallest (true) or largest (false) keys. + * @param fused_last_filter + * when it's true, the last filter is fused into the kernel in the last pass and only one thread + * block will do the filtering; when false, a standalone filter kernel with multiple thread + * blocks is called. The later case is preferable when leading bits of input data are almost the + * same. That is, when the value range of input data is narrow. In such case, there could be a + * large number of inputs for the last filter, hence using multiple thread blocks is beneficial. * @param stream * @param mr an optional memory resource to use across the calls (you can provide a large enough * memory pool here to avoid memory allocations within the call). @@ -553,109 +1112,65 @@ inline dim3 get_optimal_grid_size(size_t req_batch_size, size_t len) template void select_k(const T* in, const IdxT* in_idx, - size_t batch_size, - size_t len, - int k, + int batch_size, + IdxT len, + IdxT k, T* out, IdxT* out_idx, bool select_min, + bool fused_last_filter, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = nullptr) { - // reduce the block size if the input length is too small. 
- if constexpr (BlockSize > calc_min_block_size()) { - if (BlockSize * ITEM_PER_THREAD > len) { - return select_k( - in, in_idx, batch_size, len, k, out, out_idx, select_min, stream); + if (k == len) { + RAFT_CUDA_TRY( + cudaMemcpyAsync(out, in, sizeof(T) * batch_size * len, cudaMemcpyDeviceToDevice, stream)); + if (in_idx) { + RAFT_CUDA_TRY(cudaMemcpyAsync( + out_idx, in_idx, sizeof(IdxT) * batch_size * len, cudaMemcpyDeviceToDevice, stream)); + } else { + auto out_idx_view = + raft::make_device_vector_view(out_idx, static_cast(len) * batch_size); + raft::device_resources handle(stream); + raft::linalg::map_offset(handle, out_idx_view, raft::mod_const_op(len)); } + return; } - // TODO: is it possible to relax this restriction? - static_assert(calc_num_passes() > 1); - constexpr int num_buckets = calc_num_buckets(); - - dim3 blocks = get_optimal_grid_size(batch_size, len); - size_t max_chunk_size = blocks.y; - - size_t req_aux = max_chunk_size * (sizeof(Counter) + num_buckets * sizeof(IdxT)); - size_t req_buf = max_chunk_size * len * 2 * (sizeof(T) + sizeof(IdxT)); - size_t mem_req = req_aux + req_buf; - size_t mem_free, mem_total; - RAFT_CUDA_TRY(cudaMemGetInfo(&mem_free, &mem_total)); - std::optional managed_memory; - rmm::mr::device_memory_resource* mr_buf = nullptr; - if (mem_req > mem_free) { - // if there's not enough memory for buffers on the device, resort to the managed memory. 
- mem_req = req_aux; - managed_memory.emplace(); - mr_buf = &managed_memory.value(); - } - - auto pool_guard = raft::get_pool_memory_resource(mr, mem_req); - if (pool_guard) { - RAFT_LOG_DEBUG("radix::select_k: using pool memory resource with initial size %zu bytes", - pool_guard->pool_size()); + // TODO: use device_resources::get_device_properties() instead; should change it when we refactor + // resource management + int sm_cnt; + { + int dev; + RAFT_CUDA_TRY(cudaGetDevice(&dev)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&sm_cnt, cudaDevAttrMultiProcessorCount, dev)); } - if (mr_buf == nullptr) { mr_buf = mr; } - - rmm::device_uvector> counters(max_chunk_size, stream, mr); - rmm::device_uvector histograms(max_chunk_size * num_buckets, stream, mr); - rmm::device_uvector buf1(max_chunk_size * len, stream, mr_buf); - rmm::device_uvector idx_buf1(max_chunk_size * len, stream, mr_buf); - rmm::device_uvector buf2(max_chunk_size * len, stream, mr_buf); - rmm::device_uvector idx_buf2(max_chunk_size * len, stream, mr_buf); - for (size_t offset = 0; offset < batch_size; offset += max_chunk_size) { - blocks.y = std::min(max_chunk_size, batch_size - offset); + constexpr int items_per_thread = 32; - RAFT_CUDA_TRY( - cudaMemsetAsync(counters.data(), 0, counters.size() * sizeof(Counter), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(histograms.data(), 0, histograms.size() * sizeof(IdxT), stream)); - - const T* in_buf = nullptr; - const IdxT* in_idx_buf = nullptr; - T* out_buf = nullptr; - IdxT* out_idx_buf = nullptr; - - constexpr int num_passes = calc_num_passes(); - - for (int pass = 0; pass < num_passes; ++pass) { - if (pass == 0) { - in_buf = in + offset * len; - in_idx_buf = nullptr; - out_buf = nullptr; - out_idx_buf = nullptr; - } else if (pass == 1) { - in_buf = in + offset * len; - in_idx_buf = in_idx ? 
in_idx + offset * len : nullptr; - out_buf = buf1.data(); - out_idx_buf = idx_buf1.data(); - } else if (pass % 2 == 0) { - in_buf = buf1.data(); - in_idx_buf = idx_buf1.data(); - out_buf = buf2.data(); - out_idx_buf = idx_buf2.data(); - } else { - in_buf = buf2.data(); - in_idx_buf = idx_buf2.data(); - out_buf = buf1.data(); - out_idx_buf = idx_buf1.data(); - } - - radix_kernel - <<>>(in_buf, - in_idx_buf, - out_buf, - out_idx_buf, - out + offset * k, - out_idx + offset * k, - counters.data(), - histograms.data(), - len, - k, - !select_min, - pass); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + if (len <= BlockSize * items_per_thread) { + impl::radix_topk_one_block( + in, in_idx, batch_size, len, k, out, out_idx, select_min, sm_cnt, stream, mr); + } else { + unsigned grid_dim = + impl::calc_grid_dim(batch_size, len, sm_cnt); + if (grid_dim == 1) { + impl::radix_topk_one_block( + in, in_idx, batch_size, len, k, out, out_idx, select_min, sm_cnt, stream, mr); + } else { + impl::radix_topk(in, + in_idx, + batch_size, + len, + k, + out, + out_idx, + select_min, + fused_last_filter, + grid_dim, + sm_cnt, + stream, + mr); } } } diff --git a/cpp/include/raft/neighbors/brute_force.cuh b/cpp/include/raft/neighbors/brute_force.cuh index 4891cc5f8d..dac1a29c7f 100644 --- a/cpp/include/raft/neighbors/brute_force.cuh +++ b/cpp/include/raft/neighbors/brute_force.cuh @@ -122,9 +122,8 @@ inline void knn_merge_parts( * * raft::raft::device_resources handle; * ... 
- * int k = 10; * auto metric = raft::distance::DistanceType::L2SqrtExpanded; - * brute_force::knn(handle, index, search, indices, distances, k, metric); + * brute_force::knn(handle, index, search, indices, distances, metric); * @endcode * * @param[in] handle: the cuml handle to use @@ -132,28 +131,31 @@ inline void knn_merge_parts( * @param[in] search: matrix (size n*d) to be used for searching the index * @param[out] indices: matrix (size n*k) to store output knn indices * @param[out] distances: matrix (size n*k) to store the output knn distance - * @param[in] k: the number of nearest neighbors to return * @param[in] metric: distance metric to use. Euclidean (L2) is used by default * @param[in] metric_arg: the value of `p` for Minkowski (l-p) distances. This * is ignored if the metric_type is not Minkowski. * @param[in] global_id_offset: optional starting global id mapping for the local partition * (assumes the index contains contiguous ids in the global id space) + * @param[in] distance_epilogue: optional epilogue function to run after computing distances. This + function takes a triple of the (value, rowid, colid) for each + element in the pairwise distances and returns a transformed value + back. 
*/ template + typename search_layout, + typename epilogue_op = raft::identity_op> void knn(raft::device_resources const& handle, std::vector> index, raft::device_matrix_view search, raft::device_matrix_view indices, raft::device_matrix_view distances, - value_int k, distance::DistanceType metric = distance::DistanceType::L2Unexpanded, std::optional metric_arg = std::make_optional(2.0f), - std::optional global_id_offset = std::nullopt) + std::optional global_id_offset = std::nullopt, + epilogue_op distance_epilogue = raft::identity_op()) { RAFT_EXPECTS(index[0].extent(1) == search.extent(1), "Number of dimensions for both index and search matrices must be equal"); @@ -161,15 +163,14 @@ void knn(raft::device_resources const& handle, RAFT_EXPECTS(indices.extent(0) == distances.extent(0) && distances.extent(0) == search.extent(0), "Number of rows in output indices and distances matrices must equal number of rows " "in search matrix."); - RAFT_EXPECTS( - indices.extent(1) == distances.extent(1) && distances.extent(1) == static_cast(k), - "Number of columns in output indices and distances matrices must be equal to k"); + RAFT_EXPECTS(indices.extent(1) == distances.extent(1) && distances.extent(1), + "Number of columns in output indices and distances matrices must the same"); bool rowMajorIndex = std::is_same_v; bool rowMajorQuery = std::is_same_v; std::vector inputs; - std::vector sizes; + std::vector sizes; for (std::size_t i = 0; i < index.size(); ++i) { inputs.push_back(const_cast(index[i].data_handle())); sizes.push_back(index[i].extent(0)); @@ -183,18 +184,19 @@ void knn(raft::device_resources const& handle, raft::neighbors::detail::brute_force_knn_impl(handle, inputs, sizes, - static_cast(index[0].extent(1)), + index[0].extent(1), // TODO: This is unfortunate. Need to fix. 
const_cast(search.data_handle()), - static_cast(search.extent(0)), + search.extent(0), indices.data_handle(), distances.data_handle(), - k, + indices.extent(1), rowMajorIndex, rowMajorQuery, trans_arg, metric, - metric_arg.value_or(2.0f)); + metric_arg.value_or(2.0f), + distance_epilogue); } /** diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh index f657070df4..e6533eaf51 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh @@ -1065,6 +1065,14 @@ void ivfflat_interleaved_scan(const index& index, uint32_t& grid_dim_x, rmm::cuda_stream_view stream) { + // greppable-id-specializations-ivf-flat-search: The ivfflat_interleaved_scan + // function is used in both raft::neighbors::ivf_flat::search and + // raft::neighbors::detail::refine_device. To prevent a duplicate + // instantiation of this function (which defines ~270 kernels) in the refine + // specializations, an extern template definition is provided. Please check + // related function calls after editing this function definition. Search for + // `greppable-id-specializations-ivf-flat-search` to find them. 
+ const int capacity = bound_by_power_of_two(k); select_interleaved_scan_kernel::run(capacity, index.veclen(), diff --git a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh index 875fc3b37c..a776ce2586 100644 --- a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh +++ b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh @@ -47,7 +47,9 @@ using namespace raft::spatial::knn; * Calculates brute force knn, using a fixed memory budget * by tiling over both the rows and columns of pairwise_distances */ -template +template void tiled_brute_force_knn(const raft::device_resources& handle, const ElementType* search, // size (m ,d) const ElementType* index, // size (n ,d) @@ -58,9 +60,10 @@ void tiled_brute_force_knn(const raft::device_resources& handle, ElementType* distances, // size (m, k) IndexType* indices, // size (m, k) raft::distance::DistanceType metric, - float metric_arg = 0.0, - size_t max_row_tile_size = 0, - size_t max_col_tile_size = 0) + float metric_arg = 2.0, + size_t max_row_tile_size = 0, + size_t max_col_tile_size = 0, + DistanceEpilogue distance_epilogue = raft::identity_op()) { // Figure out the number of rows/cols to tile for size_t tile_rows = 0; @@ -152,25 +155,41 @@ void tiled_brute_force_knn(const raft::device_resources& handle, metric_arg); if (metric == raft::distance::DistanceType::L2Expanded || metric == raft::distance::DistanceType::L2SqrtExpanded) { - auto row_norms = search_norms.data() + i; - auto col_norms = index_norms.data() + j; + auto row_norms = search_norms.data(); + auto col_norms = index_norms.data(); auto dist = temp_distances.data(); raft::linalg::map_offset( handle, raft::make_device_vector_view(dist, current_query_size * current_centroid_size), - [=] __device__(IndexType i) { - IndexType row = i / current_centroid_size, col = i % current_centroid_size; + [=] __device__(IndexType idx) { + IndexType row = i + (idx / current_centroid_size); + IndexType col = j + 
(idx % current_centroid_size); - auto val = row_norms[row] + col_norms[col] - 2.0 * dist[i]; + auto val = row_norms[row] + col_norms[col] - 2.0 * dist[idx]; // due to numerical instability (especially around self-distance) // the distances here could be slightly negative, which will // cause NaN values in the subsequent sqrt. Clamp to 0 val = val * (val >= 0.0001); if (metric == raft::distance::DistanceType::L2SqrtExpanded) { val = sqrt(val); } + val = distance_epilogue(val, row, col); return val; }); + } else { + // if we're not l2 distance, and we have a distance epilogue - run it now + if constexpr (!std::is_same_v) { + auto distances_ptr = temp_distances.data(); + raft::linalg::map_offset( + handle, + raft::make_device_vector_view(temp_distances.data(), + current_query_size * current_centroid_size), + [=] __device__(size_t idx) { + IndexType row = i + (idx / current_centroid_size); + IndexType col = j + (idx % current_centroid_size); + return distance_epilogue(distances_ptr[idx], row, col); + }); + } } select_k(temp_distances.data(), @@ -250,7 +269,10 @@ void tiled_brute_force_knn(const raft::device_resources& handle, * @param[in] metric corresponds to the raft::distance::DistanceType enum (default is L2Expanded) * @param[in] metricArg metric argument to use. 
Corresponds to the p arg for lp norm */ -template +template void brute_force_knn_impl( raft::device_resources const& handle, std::vector& input, @@ -265,7 +287,8 @@ void brute_force_knn_impl( bool rowMajorQuery = true, std::vector* translations = nullptr, raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, - float metricArg = 0) + float metricArg = 0, + DistanceEpilogue distance_epilogue = raft::identity_op()) { auto userStream = handle.get_stream(); @@ -355,6 +378,7 @@ void brute_force_knn_impl( auto stream = handle.get_next_usable_stream(i); if (k <= 64 && rowMajorQuery == rowMajorIndex && rowMajorQuery == true && + std::is_same_v && (metric == raft::distance::DistanceType::L2Unexpanded || metric == raft::distance::DistanceType::L2SqrtUnexpanded || metric == raft::distance::DistanceType::L2Expanded || @@ -424,7 +448,10 @@ void brute_force_knn_impl( out_d_ptr, out_i_ptr, tiled_metric, - metricArg); + metricArg, + 0, + 0, + distance_epilogue); break; } } diff --git a/cpp/include/raft/neighbors/detail/refine.cuh b/cpp/include/raft/neighbors/detail/refine.cuh index f244d5875c..aedfc42698 100644 --- a/cpp/include/raft/neighbors/detail/refine.cuh +++ b/cpp/include/raft/neighbors/detail/refine.cuh @@ -117,6 +117,14 @@ void refine_device(raft::device_resources const& handle, n_queries, n_candidates); + // greppable-id-specializations-ivf-flat-search: The ivfflat_interleaved_scan + // function is used in both raft::neighbors::ivf_flat::search and + // raft::neighbors::detail::refine_device. To prevent a duplicate + // instantiation of this function (which defines ~270 kernels) in the refine + // specializations, an extern template definition is provided. Please check + // and adjust the extern template definition and the instantiation when the + // below function call is edited. Search for + // `greppable-id-specializations-ivf-flat-search` to find them. 
uint32_t grid_dim_x = 1; raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan< data_t, diff --git a/cpp/include/raft/neighbors/specializations.cuh b/cpp/include/raft/neighbors/specializations.cuh index 27105b6eab..9da5649ef8 100644 --- a/cpp/include/raft/neighbors/specializations.cuh +++ b/cpp/include/raft/neighbors/specializations.cuh @@ -16,10 +16,14 @@ #pragma once +#include +#include +#include + #include #include #include #include #include -#include \ No newline at end of file +#include diff --git a/cpp/include/raft/neighbors/specializations/knn.cuh b/cpp/include/raft/neighbors/specializations/brute_force.cuh similarity index 60% rename from cpp/include/raft/neighbors/specializations/knn.cuh rename to cpp/include/raft/neighbors/specializations/brute_force.cuh index e0b64415fe..1337beb68a 100644 --- a/cpp/include/raft/neighbors/specializations/knn.cuh +++ b/cpp/include/raft/neighbors/specializations/brute_force.cuh @@ -17,31 +17,6 @@ #pragma once #include -#include - -namespace raft::spatial::knn { -#define RAFT_INST(IdxT, T, IntT) \ - extern template void brute_force_knn(raft::device_resources const& handle, \ - std::vector& input, \ - std::vector& sizes, \ - IntT D, \ - T* search_items, \ - IntT n, \ - IdxT* res_I, \ - T* res_D, \ - IntT k, \ - bool rowMajorIndex, \ - bool rowMajorQuery, \ - std::vector* translations, \ - distance::DistanceType metric, \ - float metric_arg); - -RAFT_INST(long, float, int); -RAFT_INST(long, float, unsigned int); -RAFT_INST(uint32_t, float, int); -RAFT_INST(uint32_t, float, unsigned int); -#undef RAFT_INST -}; // namespace raft::spatial::knn // also define the detail api, which is used by raft::neighbors::brute_force // (not doing the public api, since has extra template params on index_layout, matrix_index, @@ -61,7 +36,8 @@ namespace raft::neighbors::detail { bool rowMajorQuery, \ std::vector* translations, \ raft::distance::DistanceType metric, \ - float metricArg); + float metricArg, \ + raft::identity_op); 
RAFT_INST(long, float, int); RAFT_INST(long, float, unsigned int); RAFT_INST(uint32_t, float, int); diff --git a/cpp/include/raft/neighbors/specializations/ivf_flat.cuh b/cpp/include/raft/neighbors/specializations/ivf_flat.cuh index 02e1cbebb0..161f3462c9 100644 --- a/cpp/include/raft/neighbors/specializations/ivf_flat.cuh +++ b/cpp/include/raft/neighbors/specializations/ivf_flat.cuh @@ -20,35 +20,58 @@ namespace raft::neighbors::ivf_flat { -#define RAFT_INST(T, IdxT) \ - extern template auto build(raft::device_resources const& handle, \ - const index_params& params, \ - raft::device_matrix_view dataset) \ - ->index; \ - \ - extern template auto extend( \ - raft::device_resources const& handle, \ - raft::device_matrix_view new_vectors, \ - std::optional> new_indices, \ - const index& orig_index) \ - ->index; \ - \ - extern template void extend( \ - raft::device_resources const& handle, \ - raft::device_matrix_view new_vectors, \ - std::optional> new_indices, \ - raft::neighbors::ivf_flat::index* idx); \ - \ - extern template void search(raft::device_resources const&, \ - raft::neighbors::ivf_flat::search_params const&, \ - const raft::neighbors::ivf_flat::index&, \ - raft::device_matrix_view, \ - raft::device_matrix_view, \ - raft::device_matrix_view); +// greppable-id-specializations-ivf-flat-search: The ivfflat_interleaved_scan +// function is used in both raft::neighbors::ivf_flat::search and +// raft::neighbors::detail::refine_device. To prevent a duplicate instantiation +// of this function (which defines ~270 kernels) in the refine specializations, +// an extern template definition is provided here. Please check related function +// calls after editing template definition below. Search for +// `greppable-id-specializations-ivf-flat-search` to find them. 
+#define RAFT_INST(T, IdxT) \ + extern template auto build(raft::device_resources const& handle, \ + const index_params& params, \ + raft::device_matrix_view dataset) \ + ->index; \ + \ + extern template auto extend( \ + raft::device_resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + const index& orig_index) \ + ->index; \ + \ + extern template void extend( \ + raft::device_resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + raft::neighbors::ivf_flat::index* idx); \ + \ + extern template void search(raft::device_resources const&, \ + raft::neighbors::ivf_flat::search_params const&, \ + const raft::neighbors::ivf_flat::index&, \ + raft::device_matrix_view, \ + raft::device_matrix_view, \ + raft::device_matrix_view); \ + \ + extern template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan< \ + T, \ + typename raft::spatial::knn::detail::utils::config::value_t, \ + IdxT>(const index& index, \ + const T* queries, \ + const uint32_t* coarse_query_results, \ + const uint32_t n_queries, \ + const raft::distance::DistanceType metric, \ + const uint32_t n_probes, \ + const uint32_t k, \ + const bool select_min, \ + IdxT* neighbors, \ + float* distances, \ + uint32_t& grid_dim_x, \ + rmm::cuda_stream_view stream); -RAFT_INST(float, uint64_t); -RAFT_INST(int8_t, uint64_t); -RAFT_INST(uint8_t, uint64_t); +RAFT_INST(float, int64_t); +RAFT_INST(int8_t, int64_t); +RAFT_INST(uint8_t, int64_t); #undef RAFT_INST } // namespace raft::neighbors::ivf_flat diff --git a/cpp/include/raft/solver/detail/lap_functions.cuh b/cpp/include/raft/solver/detail/lap_functions.cuh index 440e6901c6..63f27e6346 100644 --- a/cpp/include/raft/solver/detail/lap_functions.cuh +++ b/cpp/include/raft/solver/detail/lap_functions.cuh @@ -113,7 +113,7 @@ inline void initialReduction(raft::device_resources const& handle, kernel_rowReduction<<>>( d_costs, d_vertices_dev.row_duals, SP, N, 
std::numeric_limits::max()); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); kernel_columnReduction<<>>( d_costs, d_vertices_dev.row_duals, @@ -121,7 +121,7 @@ inline void initialReduction(raft::device_resources const& handle, SP, N, std::numeric_limits::max()); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } template @@ -159,7 +159,7 @@ inline void computeInitialAssignments(raft::device_resources const& handle, SP, N, epsilon); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } // Function for finding row cover on individual devices. @@ -191,7 +191,7 @@ inline int computeRowCovers(raft::device_resources const& handle, kernel_computeRowCovers<<>>( d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); return thrust::reduce(thrust::device, d_vertices.row_covers, d_vertices.row_covers + size); } @@ -268,7 +268,7 @@ inline vertex_t zeroCoverIteration(raft::device_resources const& handle, 0, handle.get_stream()>>>( predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); M = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); thrust::exclusive_scan( @@ -286,7 +286,7 @@ inline vertex_t zeroCoverIteration(raft::device_resources const& handle, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } } @@ -356,7 +356,7 @@ inline void reversePass(raft::device_resources const& handle, handle.get_stream()>>>( predicates_v.data(), addresses_v.data(), d_col_data_dev.is_visited, size); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); // calculate total number of vertices. 
std::size_t csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); @@ -375,11 +375,11 @@ inline void reversePass(raft::device_resources const& handle, kernel_augmentScatter<<>>( elements_v.data(), predicates_v.data(), addresses_v.data(), size); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); kernel_reverseTraversal<<>>( elements_v.data(), d_row_data_dev, d_col_data_dev, csr_size); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } } @@ -410,7 +410,7 @@ inline void augmentationPass(raft::device_resources const& handle, handle.get_stream()>>>( predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP * N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); // calculate total number of vertices. // TODO: should be vertex_t @@ -432,7 +432,7 @@ inline void augmentationPass(raft::device_resources const& handle, kernel_augmentScatter<<>>( elements_v.data(), predicates_v.data(), addresses_v.data(), vertex_t{SP * N}); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); kernel_augmentation<<>>( d_vertices_dev.row_assignments, @@ -443,7 +443,7 @@ inline void augmentationPass(raft::device_resources const& handle, vertex_t{N}, row_ids_csr_size); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } } @@ -471,7 +471,7 @@ inline void dualUpdate(raft::device_resources const& handle, N, std::numeric_limits::max()); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); detail::calculateRectangularDims(blocks_per_grid, threads_per_block, total_blocks, N, SP); kernel_dualUpdate_2<<>>( @@ -488,7 +488,7 @@ inline void dualUpdate(raft::device_resources const& handle, std::numeric_limits::max(), epsilon); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } // Function for calculating optimal objective function value using dual variables. 
@@ -508,7 +508,7 @@ inline void calcObjValDual(raft::device_resources const& handle, kernel_calcObjValDual<<>>( d_obj_val, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } // Function for calculating optimal objective function value using dual variables. @@ -529,7 +529,7 @@ inline void calcObjValPrimal(raft::device_resources const& handle, kernel_calcObjValPrimal<<>>( d_obj_val, d_costs, d_row_assignments, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } } // namespace raft::solver::detail diff --git a/cpp/include/raft/solver/linear_assignment.cuh b/cpp/include/raft/solver/linear_assignment.cuh index 7904c04ede..6e66bafe1f 100644 --- a/cpp/include/raft/solver/linear_assignment.cuh +++ b/cpp/include/raft/solver/linear_assignment.cuh @@ -170,7 +170,7 @@ class LinearAssignmentProblem { { weight_t result; raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream()); - CHECK_CUDA(handle_.get_stream()); + RAFT_CHECK_CUDA(handle_.get_stream()); return result; } @@ -183,7 +183,7 @@ class LinearAssignmentProblem { { weight_t result; raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream()); - CHECK_CUDA(handle_.get_stream()); + RAFT_CHECK_CUDA(handle_.get_stream()); return result; } diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh index 63bc98b404..67d6f6c412 100644 --- a/cpp/include/raft/sparse/solver/detail/lanczos.cuh +++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh @@ -962,7 +962,7 @@ int computeSmallestEigenvectors( (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h, @@ -1312,7 +1312,7 @@ int computeLargestEigenvectors( 
cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h, diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh index 99d688e232..c8fc6eefda 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh @@ -185,7 +185,6 @@ void k_closest_landmarks(raft::device_resources const& handle, make_device_matrix_view(query_pts, n_query_pts, inputs[0].extent(1)), make_device_matrix_view(R_knn_inds, n_query_pts, k), make_device_matrix_view(R_knn_dists, n_query_pts, k), - k, index.get_metric()); } diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh index 4e18a210d4..4a571c1447 100644 --- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh +++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh @@ -22,6 +22,8 @@ #include "processing.cuh" #include #include +#include +#include #include #include @@ -183,13 +185,11 @@ DI void updateSortedWarpQ( } } -template Pair; @@ -222,295 +222,279 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x using namespace raft::neighbors::detail::faiss_select; typedef WarpSelect, NumWarpQ, NumThreadQ, 32> myWarpSelect; - auto rowEpilog_lambda = [m, n, numOfNN, out_dists, out_inds, mutexes] __device__( - IdxT gridStrideY) { - if (gridDim.x == 1) { return; } - - Pair* shDumpKV = nullptr; - if (useNorms) { - shDumpKV = (Pair*)(&smem[Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT))]); - } else { - shDumpKV = (Pair*)(&smem[Policy::SmemSize]); - } - - const int lid = threadIdx.x % warpSize; - const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols); - - // 0 -> consumer done consuming the buffer. 
- // -1 -> consumer started consuming the buffer - // -2 -> producer done filling the buffer - // 1 -> prod acquired to fill the buffer - if (blockIdx.x == 0) { - auto cta_processed = 0; - myWarpSelect heapArr1(identity, keyMax, numOfNN); - myWarpSelect heapArr2(identity, keyMax, numOfNN); - myWarpSelect* heapArr[] = {&heapArr1, &heapArr2}; - __syncwarp(); - - loadAllWarpQShmem(heapArr, &shDumpKV[0], m, numOfNN); - - while (cta_processed < gridDim.x - 1) { - if (threadIdx.x == 0) { - while (atomicCAS((int*)&mutexes[gridStrideY / Policy::Mblk], -2, -1) != -2) - ; - } - __threadfence(); - __syncthreads(); + auto rowEpilog_lambda = + [m, n, &distance_op, numOfNN, out_dists, out_inds, mutexes] __device__(IdxT gridStrideY) { + if (gridDim.x == 1) { return; } + + // Use ::template to disambiguate (See: + // https://en.cppreference.com/w/cpp/language/dependent_name) + int smem_offset = OpT::template shared_mem_size(); + Pair* shDumpKV = (Pair*)(&smem[smem_offset]); + + const int lid = threadIdx.x % warpSize; + const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols); + + // 0 -> consumer done consuming the buffer. 
+ // -1 -> consumer started consuming the buffer + // -2 -> producer done filling the buffer + // 1 -> prod acquired to fill the buffer + if (blockIdx.x == 0) { + auto cta_processed = 0; + myWarpSelect heapArr1(identity, keyMax, numOfNN); + myWarpSelect heapArr2(identity, keyMax, numOfNN); + myWarpSelect* heapArr[] = {&heapArr1, &heapArr2}; + __syncwarp(); + + loadAllWarpQShmem(heapArr, &shDumpKV[0], m, numOfNN); + + while (cta_processed < gridDim.x - 1) { + if (threadIdx.x == 0) { + while (atomicCAS((int*)&mutexes[gridStrideY / Policy::Mblk], -2, -1) != -2) + ; + } + __threadfence(); + __syncthreads(); #pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; - if (rowId < m) { + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + if (rowId < m) { #pragma unroll - for (int j = 0; j < myWarpSelect::kNumWarpQRegisters; ++j) { - Pair otherKV; - otherKV.value = identity; - otherKV.key = keyMax; - const auto idx = j * warpSize + lid; - if (idx < numOfNN) { - otherKV.value = out_dists[rowId * numOfNN + idx]; - otherKV.key = (uint32_t)out_inds[rowId * numOfNN + idx]; - const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; - shDumpKV[shMemRowId * numOfNN + idx] = otherKV; + for (int j = 0; j < myWarpSelect::kNumWarpQRegisters; ++j) { + Pair otherKV; + otherKV.value = identity; + otherKV.key = keyMax; + const auto idx = j * warpSize + lid; + if (idx < numOfNN) { + otherKV.value = out_dists[rowId * numOfNN + idx]; + otherKV.key = (uint32_t)out_inds[rowId * numOfNN + idx]; + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + shDumpKV[shMemRowId * numOfNN + idx] = otherKV; + } } } } - } - __threadfence(); - __syncthreads(); + __threadfence(); + __syncthreads(); - if (threadIdx.x == 0) { atomicExch((int*)&mutexes[gridStrideY / Policy::Mblk], 0); } - __threadfence(); + if (threadIdx.x == 0) { 
atomicExch((int*)&mutexes[gridStrideY / Policy::Mblk], 0); } + __threadfence(); // Perform merging of otherKV with topk's across warp. #pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; - if (rowId < m) { + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + if (rowId < m) { #pragma unroll - for (int j = 0; j < myWarpSelect::kNumWarpQRegisters; ++j) { - Pair otherKV; - otherKV.value = identity; - otherKV.key = keyMax; - const auto idx = j * warpSize + lid; - if (idx < numOfNN) { - const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; - otherKV = shDumpKV[shMemRowId * numOfNN + idx]; + for (int j = 0; j < myWarpSelect::kNumWarpQRegisters; ++j) { + Pair otherKV; + otherKV.value = identity; + otherKV.key = keyMax; + const auto idx = j * warpSize + lid; + if (idx < numOfNN) { + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + otherKV = shDumpKV[shMemRowId * numOfNN + idx]; + } + heapArr[i]->add(otherKV.value, otherKV.key); } - heapArr[i]->add(otherKV.value, otherKV.key); } } + cta_processed++; } - cta_processed++; - } #pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; - if (rowId < m) { - bool needSort = (heapArr[i]->numVals > 0); - needSort = __any_sync(0xffffffff, needSort); - if (needSort) { heapArr[i]->reduce(); } + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + if (rowId < m) { + bool needSort = (heapArr[i]->numVals > 0); + needSort = __any_sync(0xffffffff, needSort); + if (needSort) { heapArr[i]->reduce(); } + } } - } - storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, starty); - } else { - if (threadIdx.x == 0) { - while (atomicCAS((int*)&mutexes[gridStrideY / Policy::Mblk], 0, 1) != 0) - ; - } - __threadfence(); - __syncthreads(); + 
storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, starty); + } else { + if (threadIdx.x == 0) { + while (atomicCAS((int*)&mutexes[gridStrideY / Policy::Mblk], 0, 1) != 0) + ; + } + __threadfence(); + __syncthreads(); #pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; - if (rowId < m) { - for (int idx = lid; idx < numOfNN; idx += warpSize) { - const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; - Pair KVPair = shDumpKV[shMemRowId * numOfNN + idx]; - out_dists[rowId * numOfNN + idx] = KVPair.value; - out_inds[rowId * numOfNN + idx] = (IdxT)KVPair.key; + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + if (rowId < m) { + for (int idx = lid; idx < numOfNN; idx += warpSize) { + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + Pair KVPair = shDumpKV[shMemRowId * numOfNN + idx]; + out_dists[rowId * numOfNN + idx] = KVPair.value; + out_inds[rowId * numOfNN + idx] = (IdxT)KVPair.key; + } } } - } - __threadfence(); - __syncthreads(); - - if (threadIdx.x == 0) { atomicExch((int*)&mutexes[gridStrideY / Policy::Mblk], -2); } - __threadfence(); - } - }; + __threadfence(); + __syncthreads(); - // epilogue operation lambda for final value calculation - auto epilog_lambda = [numOfNN, m, n, ldd, out_dists, out_inds, keyMax, identity] __device__( - AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { - if (useNorms) { -#pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - acc[i][j] = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j]; - } + if (threadIdx.x == 0) { atomicExch((int*)&mutexes[gridStrideY / Policy::Mblk], -2); } + __threadfence(); } - } + }; - Pair* shDumpKV = nullptr; - if (useNorms) { - constexpr size_t shmemSize = - 
Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); - shDumpKV = (Pair*)(&smem[shmemSize]); - } else { - shDumpKV = (Pair*)(&smem[Policy::SmemSize]); - } + // epilogue operation lambda for final value calculation + auto epilog_lambda = + [&distance_op, numOfNN, m, n, ldd, out_dists, out_inds, keyMax, identity] __device__( + AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { + // Use ::template to disambiguate (See: + // https://en.cppreference.com/w/cpp/language/dependent_name) + int smem_offset = OpT::template shared_mem_size(); + Pair* shDumpKV = (Pair*)(&smem[smem_offset]); + + constexpr uint32_t mask = 0xffffffffu; + const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols); + const IdxT startx = gridStrideX + (threadIdx.x % Policy::AccThCols); + const int lid = raft::laneId(); - constexpr uint32_t mask = 0xffffffffu; - const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols); - const IdxT startx = gridStrideX + (threadIdx.x % Policy::AccThCols); - const int lid = raft::laneId(); - - myWarpSelect heapArr1(identity, keyMax, numOfNN); - myWarpSelect heapArr2(identity, keyMax, numOfNN); - myWarpSelect* heapArr[] = {&heapArr1, &heapArr2}; - if (usePrevTopKs) { - if (gridStrideX == blockIdx.x * Policy::Nblk) { - loadPrevTopKsGmemWarpQ(heapArr, out_dists, out_inds, m, numOfNN, starty); + myWarpSelect heapArr1(identity, keyMax, numOfNN); + myWarpSelect heapArr2(identity, keyMax, numOfNN); + myWarpSelect* heapArr[] = {&heapArr1, &heapArr2}; + if (usePrevTopKs) { + if (gridStrideX == blockIdx.x * Policy::Nblk) { + loadPrevTopKsGmemWarpQ(heapArr, out_dists, out_inds, m, numOfNN, starty); + } } - } - if (gridStrideX > blockIdx.x * Policy::Nblk) { + if (gridStrideX > blockIdx.x * Policy::Nblk) { #pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; - Pair tempKV = 
shDumpKV[(rowId * numOfNN) + numOfNN - 1]; - heapArr[i]->warpKTop = tempKV.value; - } + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + Pair tempKV = shDumpKV[(rowId * numOfNN) + numOfNN - 1]; + heapArr[i]->warpKTop = tempKV.value; + } - // total vals can atmost be 256, (32*8) - int numValsWarpTopK[Policy::AccRowsPerTh]; - int anyWarpTopKs = 0; + // total vals can atmost be 256, (32*8) + int numValsWarpTopK[Policy::AccRowsPerTh]; + int anyWarpTopKs = 0; #pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; - numValsWarpTopK[i] = 0; - if (rowId < m) { + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + numValsWarpTopK[i] = 0; + if (rowId < m) { #pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - const auto colId = startx + j * Policy::AccThCols; - if (colId < ldd) { - if (acc[i][j] < heapArr[i]->warpKTop) { numValsWarpTopK[i]++; } + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + const auto colId = startx + j * Policy::AccThCols; + if (colId < ldd) { + if (acc[i][j] < heapArr[i]->warpKTop) { numValsWarpTopK[i]++; } + } } + anyWarpTopKs += numValsWarpTopK[i]; } - anyWarpTopKs += numValsWarpTopK[i]; } - } - anyWarpTopKs = __syncthreads_or(anyWarpTopKs > 0); - if (anyWarpTopKs) { - Pair* allWarpTopKs = (Pair*)(&smem[0]); - uint32_t needScanSort[Policy::AccRowsPerTh]; + anyWarpTopKs = __syncthreads_or(anyWarpTopKs > 0); + if (anyWarpTopKs) { + Pair* allWarpTopKs = (Pair*)(&smem[0]); + uint32_t needScanSort[Policy::AccRowsPerTh]; #pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto gmemRowId = starty + i * Policy::AccThRows; - needScanSort[i] = 0; - if (gmemRowId < m) { - int myVals = numValsWarpTopK[i]; - needScanSort[i] = __ballot_sync(mask, myVals > 0); - if (needScanSort[i]) { + for (int i = 0; i < Policy::AccRowsPerTh; ++i) 
{ + const auto gmemRowId = starty + i * Policy::AccThRows; + needScanSort[i] = 0; + if (gmemRowId < m) { + int myVals = numValsWarpTopK[i]; + needScanSort[i] = __ballot_sync(mask, myVals > 0); + if (needScanSort[i]) { #pragma unroll - for (unsigned int k = 1; k <= 16; k *= 2) { - const unsigned int n = __shfl_up_sync(mask, numValsWarpTopK[i], k); - if (lid >= k) { numValsWarpTopK[i] += n; } + for (unsigned int k = 1; k <= 16; k *= 2) { + const unsigned int n = __shfl_up_sync(mask, numValsWarpTopK[i], k); + if (lid >= k) { numValsWarpTopK[i] += n; } + } } + // As each thread will know its total vals to write. + // we only store its starting location. + numValsWarpTopK[i] -= myVals; } - // As each thread will know its total vals to write. - // we only store its starting location. - numValsWarpTopK[i] -= myVals; - } - if (needScanSort[i]) { - const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; - if (gmemRowId < m) { - if (needScanSort[i] & ((uint32_t)1 << lid)) { + if (needScanSort[i]) { + const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + if (gmemRowId < m) { + if (needScanSort[i] & ((uint32_t)1 << lid)) { #pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - const auto colId = startx + j * Policy::AccThCols; - if (colId < ldd) { - if (acc[i][j] < heapArr[i]->warpKTop) { - Pair otherKV = {colId, acc[i][j]}; - allWarpTopKs[rowId * (256) + numValsWarpTopK[i]] = otherKV; - numValsWarpTopK[i]++; + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + const auto colId = startx + j * Policy::AccThCols; + if (colId < ldd) { + if (acc[i][j] < heapArr[i]->warpKTop) { + Pair otherKV = {colId, acc[i][j]}; + allWarpTopKs[rowId * (256) + numValsWarpTopK[i]] = otherKV; + numValsWarpTopK[i]++; + } } } } + __syncwarp(); + const int finalNumVals = raft::shfl(numValsWarpTopK[i], 31); + loadWarpQShmem(heapArr[i], &shDumpKV[0], rowId, numOfNN); + updateSortedWarpQ( + heapArr[i], &allWarpTopKs[0], rowId, finalNumVals); } - 
__syncwarp(); - const int finalNumVals = raft::shfl(numValsWarpTopK[i], 31); - loadWarpQShmem(heapArr[i], &shDumpKV[0], rowId, numOfNN); - updateSortedWarpQ( - heapArr[i], &allWarpTopKs[0], rowId, finalNumVals); } } - } - __syncthreads(); + __syncthreads(); #pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - if (needScanSort[i]) { - const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; - const auto gmemRowId = starty + i * Policy::AccThRows; - if (gmemRowId < m) { - storeWarpQShmem(heapArr[i], shDumpKV, rowId, numOfNN); + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + if (needScanSort[i]) { + const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto gmemRowId = starty + i * Policy::AccThRows; + if (gmemRowId < m) { + storeWarpQShmem(heapArr[i], shDumpKV, rowId, numOfNN); + } } } } - } - } else { + } else { #pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto gmemRowId = starty + i * Policy::AccThRows; - const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; - if (gmemRowId < m) { + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto gmemRowId = starty + i * Policy::AccThRows; + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + if (gmemRowId < m) { #pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - const auto colId = startx + j * Policy::AccThCols; - Pair otherKV = {keyMax, identity}; - if (colId < ldd) { - otherKV.value = acc[i][j]; - otherKV.key = colId; + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + const auto colId = startx + j * Policy::AccThCols; + Pair otherKV = {keyMax, identity}; + if (colId < ldd) { + otherKV.value = acc[i][j]; + otherKV.key = colId; + } + heapArr[i]->add(otherKV.value, otherKV.key); } - heapArr[i]->add(otherKV.value, otherKV.key); - } - bool needSort = (heapArr[i]->numVals > 0); - needSort = __any_sync(mask, needSort); - if (needSort) { 
heapArr[i]->reduce(); } - storeWarpQShmem(heapArr[i], shDumpKV, shMemRowId, numOfNN); + bool needSort = (heapArr[i]->numVals > 0); + needSort = __any_sync(mask, needSort); + if (needSort) { heapArr[i]->reduce(); } + storeWarpQShmem(heapArr[i], shDumpKV, shMemRowId, numOfNN); + } } } - } - if (((gridStrideX + Policy::Nblk * gridDim.x) >= n) && gridDim.x == 1) { - // This is last iteration of grid stride X - loadAllWarpQShmem(heapArr, &shDumpKV[0], m, numOfNN); - storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, starty); - } - }; + if (((gridStrideX + Policy::Nblk * gridDim.x) >= n) && gridDim.x == 1) { + // This is last iteration of grid stride X + loadAllWarpQShmem(heapArr, &shDumpKV[0], m, numOfNN); + storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, starty); + } + }; - raft::distance::detail::PairwiseDistances + write_out> obj(x, y, m, @@ -521,9 +505,9 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x ldd, _xn, _yn, - nullptr, + nullptr, // output ptr, can be null as write_out == false. 
smem, - core_op, + distance_op, epilog_lambda, fin_op, rowEpilog_lambda); @@ -562,38 +546,32 @@ void fusedL2UnexpKnnImpl(const DataT* x, dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = x - y; - acc += diff * diff; - }; - typedef cub::KeyValuePair Pair; - if (isRowMajor) { - constexpr auto fusedL2UnexpKnn32RowMajor = fusedL2kNN distance_op{sqrt}; + raft::identity_op fin_op{}; + + if constexpr (isRowMajor) { + constexpr auto fusedL2UnexpKnn32RowMajor = fusedL2kNN; - constexpr auto fusedL2UnexpKnn64RowMajor = fusedL2kNN; + constexpr auto fusedL2UnexpKnn64RowMajor = fusedL2kNN; + isRowMajor>; auto fusedL2UnexpKnnRowMajor = fusedL2UnexpKnn32RowMajor; if (numOfNN <= 32) { @@ -604,8 +582,10 @@ void fusedL2UnexpKnnImpl(const DataT* x, ASSERT(numOfNN <= 64, "fusedL2kNN: num of nearest neighbors must be <= 64"); } - const auto sharedMemSize = KPolicy::SmemSize + (KPolicy::Mblk * numOfNN * sizeof(Pair)); - dim3 grid = raft::distance::detail::launchConfigGenerator( + const auto sharedMemSize = + distance_op.template shared_mem_size() + KPolicy::Mblk * numOfNN * sizeof(Pair); + + dim3 grid = raft::distance::detail::launchConfigGenerator( m, n, sharedMemSize, fusedL2UnexpKnnRowMajor); if (grid.x > 1) { @@ -628,9 +608,8 @@ void fusedL2UnexpKnnImpl(const DataT* x, lda, ldb, ldd, - core_lambda, - raft::identity_op{}, - sqrt, + distance_op, + fin_op, (uint32_t)numOfNN, (int*)workspace, out_dists, @@ -753,36 +732,33 @@ void fusedL2ExpKnnImpl(const DataT* x, ASSERT(workspace != nullptr, "workspace is null"); dim3 blk(KPolicy::Nthreads); - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; typedef cub::KeyValuePair Pair; - if (isRowMajor) { - constexpr auto fusedL2ExpKnn32RowMajor = fusedL2kNN distance_op{sqrt}; + raft::identity_op fin_op{}; + + if constexpr (isRowMajor) { + constexpr auto fusedL2ExpKnn32RowMajor 
= fusedL2kNN; - constexpr auto fusedL2ExpKnn64RowMajor = fusedL2kNN; + constexpr auto fusedL2ExpKnn64RowMajor = fusedL2kNN; + isRowMajor>; auto fusedL2ExpKnnRowMajor = fusedL2ExpKnn32RowMajor; if (numOfNN <= 32) { @@ -793,9 +769,8 @@ void fusedL2ExpKnnImpl(const DataT* x, ASSERT(numOfNN <= 64, "fusedL2kNN: num of nearest neighbors must be <= 64"); } - const auto sharedMemSize = KPolicy::SmemSize + - ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)) + - (KPolicy::Mblk * numOfNN * sizeof(Pair)); + const auto sharedMemSize = + distance_op.template shared_mem_size() + (KPolicy::Mblk * numOfNN * sizeof(Pair)); dim3 grid = raft::distance::detail::launchConfigGenerator( m, n, sharedMemSize, fusedL2ExpKnnRowMajor); int32_t* mutexes = nullptr; @@ -835,9 +810,8 @@ void fusedL2ExpKnnImpl(const DataT* x, lda, ldb, ldd, - core_lambda, - raft::identity_op{}, - sqrt, + distance_op, + fin_op, (uint32_t)numOfNN, mutexes, out_dists, diff --git a/cpp/include/raft/spatial/knn/faiss_mr.hpp b/cpp/include/raft/spatial/knn/faiss_mr.hpp deleted file mode 100644 index 3cae417996..0000000000 --- a/cpp/include/raft/spatial/knn/faiss_mr.hpp +++ /dev/null @@ -1,640 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* -This code contains unnecessary code duplication. These could be deleted -once the relevant changes would be made on the FAISS side. Indeed most of -the logic in the below code is similar to FAISS's standard implementation -and should thus be inherited instead of duplicated. 
This FAISS's issue -once solved should allow the removal of the unnecessary duplicates -in this file : https://github.com/facebookresearch/faiss/issues/2097 -*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace raft { -namespace spatial { -namespace knn { - -using namespace faiss::gpu; - -namespace { - -// How many streams per device we allocate by default (for multi-streaming) -constexpr int kNumStreams = 2; - -// Use 256 MiB of pinned memory for async CPU <-> GPU copies by default -constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024; - -// Default temporary memory allocation for <= 4 GiB memory GPUs -constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024; - -// Default temporary memory allocation for <= 8 GiB memory GPUs -constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024; - -// Maximum temporary memory allocation for all GPUs -constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024; - -std::string allocsToString(const std::unordered_map& map) -{ - // Produce a sorted list of all outstanding allocations by type - std::unordered_map> stats; - - for (auto& entry : map) { - auto& a = entry.second; - - auto it = stats.find(a.type); - if (it != stats.end()) { - stats[a.type].first++; - stats[a.type].second += a.size; - } else { - stats[a.type] = std::make_pair(1, a.size); - } - } - - std::stringstream ss; - for (auto& entry : stats) { - ss << "Alloc type " << allocTypeToString(entry.first) << ": " << entry.second.first - << " allocations, " << entry.second.second << " bytes\n"; - } - - return ss.str(); -} - -} // namespace - -/// RMM implementation of the GpuResources object that provides for a -/// temporary memory manager -class RmmGpuResourcesImpl : public GpuResources { - public: - RmmGpuResourcesImpl() - : pinnedMemAlloc_(nullptr), - pinnedMemAllocSize_(0), - // let the adjustment 
function determine the memory size for us by passing - // in a huge value that will then be adjusted - tempMemSize_(getDefaultTempMemForGPU(-1, std::numeric_limits::max())), - pinnedMemSize_(kDefaultPinnedMemoryAllocation), - allocLogging_(false), - cmr(new rmm::mr::cuda_memory_resource), - mmr(new rmm::mr::managed_memory_resource), - pmr(new rmm::mr::pinned_memory_resource){}; - - ~RmmGpuResourcesImpl() - { - // The temporary memory allocator has allocated memory through us, so clean - // that up before we finish fully de-initializing ourselves - tempMemory_.clear(); - - // Make sure all allocations have been freed - bool allocError = false; - - for (auto& entry : allocs_) { - auto& map = entry.second; - - if (!map.empty()) { - std::cerr << "RmmGpuResources destroyed with allocations outstanding:\n" - << "Device " << entry.first << " outstanding allocations:\n"; - std::cerr << allocsToString(map); - allocError = true; - } - } - - FAISS_ASSERT_MSG(!allocError, "GPU memory allocations not properly cleaned up"); - - for (auto& entry : defaultStreams_) { - DeviceScope scope(entry.first); - - // We created these streams, so are responsible for destroying them - CUDA_VERIFY(cudaStreamDestroy(entry.second)); - } - - for (auto& entry : alternateStreams_) { - DeviceScope scope(entry.first); - - for (auto stream : entry.second) { - CUDA_VERIFY(cudaStreamDestroy(stream)); - } - } - - for (auto& entry : asyncCopyStreams_) { - DeviceScope scope(entry.first); - - CUDA_VERIFY(cudaStreamDestroy(entry.second)); - } - - for (auto& entry : blasHandles_) { - DeviceScope scope(entry.first); - - auto blasStatus = cublasDestroy(entry.second); - FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); - } - - if (pinnedMemAlloc_) { pmr->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_); } - }; - - /// Disable allocation of temporary memory; all temporary memory - /// requests will call cudaMalloc / cudaFree at the point of use - void noTempMemory() { setTempMemory(0); }; - - /// Specify that we 
wish to use a certain fixed size of memory on - /// all devices as temporary memory. This is the upper bound for the GPU - /// memory that we will reserve. We will never go above 1.5 GiB on any GPU; - /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that. - /// To avoid any temporary memory allocation, pass 0. - void setTempMemory(size_t size) - { - if (tempMemSize_ != size) { - // adjust based on general limits - tempMemSize_ = getDefaultTempMemForGPU(-1, size); - - // We need to re-initialize memory resources for all current devices that - // have been initialized. - // This should be safe to do, even if we are currently running work, because - // the cudaFree call that this implies will force-synchronize all GPUs with - // the CPU - for (auto& p : tempMemory_) { - int device = p.first; - // Free the existing memory first - p.second.reset(); - - // Allocate new - p.second = std::unique_ptr( - new StackDeviceMemory(this, - p.first, - // adjust for this specific device - getDefaultTempMemForGPU(device, tempMemSize_))); - } - } - }; - - /// Set amount of pinned memory to allocate, for async GPU <-> CPU - /// transfers - void setPinnedMemory(size_t size) - { - // Should not call this after devices have been initialized - FAISS_ASSERT(defaultStreams_.size() == 0); - FAISS_ASSERT(!pinnedMemAlloc_); - - pinnedMemSize_ = size; - }; - - /// Called to change the stream for work ordering. We do not own `stream`; - /// i.e., it will not be destroyed when the GpuResources object gets cleaned - /// up. - /// We are guaranteed that all Faiss GPU work is ordered with respect to - /// this stream upon exit from an index or other Faiss GPU call. - void setDefaultStream(int device, cudaStream_t stream) - { - if (isInitialized(device)) { - // A new series of calls may not be ordered with what was the previous - // stream, so if the stream being specified is different, then we need to - // ensure ordering between the two (new stream waits on old). 
- auto it = userDefaultStreams_.find(device); - cudaStream_t prevStream = nullptr; - - if (it != userDefaultStreams_.end()) { - prevStream = it->second; - } else { - FAISS_ASSERT(defaultStreams_.count(device)); - prevStream = defaultStreams_[device]; - } - - if (prevStream != stream) { streamWait({stream}, {prevStream}); } - } - - userDefaultStreams_[device] = stream; - }; - - /// Revert the default stream to the original stream managed by this resources - /// object, in case someone called `setDefaultStream`. - void revertDefaultStream(int device) - { - if (isInitialized(device)) { - auto it = userDefaultStreams_.find(device); - - if (it != userDefaultStreams_.end()) { - // There was a user stream set that we need to synchronize against - cudaStream_t prevStream = userDefaultStreams_[device]; - - FAISS_ASSERT(defaultStreams_.count(device)); - cudaStream_t newStream = defaultStreams_[device]; - - streamWait({newStream}, {prevStream}); - } - } - - userDefaultStreams_.erase(device); - }; - - /// Returns the stream for the given device on which all Faiss GPU work is - /// ordered. - /// We are guaranteed that all Faiss GPU work is ordered with respect to - /// this stream upon exit from an index or other Faiss GPU call. 
- cudaStream_t getDefaultStream(int device) - { - initializeForDevice(device); - - auto it = userDefaultStreams_.find(device); - if (it != userDefaultStreams_.end()) { - // There is a user override stream set - return it->second; - } - - // Otherwise, our base default stream - return defaultStreams_[device]; - }; - - /// Called to change the work ordering streams to the null stream - /// for all devices - void setDefaultNullStreamAllDevices() - { - for (int dev = 0; dev < getNumDevices(); ++dev) { - setDefaultStream(dev, nullptr); - } - }; - - /// If enabled, will print every GPU memory allocation and deallocation to - /// standard output - void setLogMemoryAllocations(bool enable) { allocLogging_ = enable; }; - - public: - /// Internal system calls - - /// Initialize resources for this device - void initializeForDevice(int device) - { - if (isInitialized(device)) { return; } - - // If this is the first device that we're initializing, create our - // pinned memory allocation - if (defaultStreams_.empty() && pinnedMemSize_ > 0) { - pinnedMemAlloc_ = pmr->allocate(pinnedMemSize_); - pinnedMemAllocSize_ = pinnedMemSize_; - } - - FAISS_ASSERT(device < getNumDevices()); - DeviceScope scope(device); - - // Make sure that device properties for all devices are cached - auto& prop = getDeviceProperties(device); - - // Also check to make sure we meet our minimum compute capability (3.0) - FAISS_ASSERT_FMT(prop.major >= 3, - "Device id %d with CC %d.%d not supported, " - "need 3.0+ compute capability", - device, - prop.major, - prop.minor); - - // Create streams - cudaStream_t defaultStream = 0; - CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking)); - - defaultStreams_[device] = defaultStream; - - cudaStream_t asyncCopyStream = 0; - CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking)); - - asyncCopyStreams_[device] = asyncCopyStream; - - std::vector deviceStreams; - for (int j = 0; j < kNumStreams; ++j) { - cudaStream_t 
stream = 0; - CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - - deviceStreams.push_back(stream); - } - - alternateStreams_[device] = std::move(deviceStreams); - - // Create cuBLAS handle - cublasHandle_t blasHandle = 0; - auto blasStatus = cublasCreate(&blasHandle); - FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); - blasHandles_[device] = blasHandle; - - // For CUDA 10 on V100, enabling tensor core usage would enable automatic - // rounding down of inputs to f16 (though accumulate in f32) which results in - // unacceptable loss of precision in general. - // For CUDA 11 / A100, only enable tensor core support if it doesn't result in - // a loss of precision. -#if CUDA_VERSION >= 11000 - cublasSetMathMode(blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); -#endif - - FAISS_ASSERT(allocs_.count(device) == 0); - allocs_[device] = std::unordered_map(); - - FAISS_ASSERT(tempMemory_.count(device) == 0); - auto mem = std::unique_ptr( - new StackDeviceMemory(this, - device, - // adjust for this specific device - getDefaultTempMemForGPU(device, tempMemSize_))); - - tempMemory_.emplace(device, std::move(mem)); - }; - - cublasHandle_t getBlasHandle(int device) - { - initializeForDevice(device); - return blasHandles_[device]; - }; - - std::vector getAlternateStreams(int device) - { - initializeForDevice(device); - return alternateStreams_[device]; - }; - - /// Allocate non-temporary GPU memory - void* allocMemory(const AllocRequest& req) - { - initializeForDevice(req.device); - - // We don't allocate a placeholder for zero-sized allocations - if (req.size == 0) { return nullptr; } - - // Make sure that the allocation is a multiple of 16 bytes for alignment - // purposes - auto adjReq = req; - adjReq.size = utils::roundUp(adjReq.size, (size_t)16); - - void* p = nullptr; - - if (allocLogging_) { std::cout << "RmmGpuResources: alloc " << adjReq.toString() << "\n"; } - - if (adjReq.space == MemorySpace::Temporary) { - // If we don't have 
enough space in our temporary memory manager, we need - // to allocate this request separately - auto& tempMem = tempMemory_[adjReq.device]; - - if (adjReq.size > tempMem->getSizeAvailable()) { - // We need to allocate this ourselves - AllocRequest newReq = adjReq; - newReq.space = MemorySpace::Device; - newReq.type = AllocType::TemporaryMemoryOverflow; - - return allocMemory(newReq); - } - - // Otherwise, we can handle this locally - p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size); - - } else if (adjReq.space == MemorySpace::Device) { - p = cmr->allocate(adjReq.size, adjReq.stream); - } else if (adjReq.space == MemorySpace::Unified) { - p = mmr->allocate(adjReq.size, adjReq.stream); - } else { - FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space); - } - - allocs_[adjReq.device][p] = adjReq; - - return p; - }; - - /// Returns a previous allocation - void deallocMemory(int device, void* p) - { - FAISS_ASSERT(isInitialized(device)); - - if (!p) { return; } - - auto& a = allocs_[device]; - auto it = a.find(p); - FAISS_ASSERT(it != a.end()); - - auto& req = it->second; - - if (allocLogging_) { std::cout << "RmmGpuResources: dealloc " << req.toString() << "\n"; } - - if (req.space == MemorySpace::Temporary) { - tempMemory_[device]->deallocMemory(device, req.stream, req.size, p); - } else if (req.space == MemorySpace::Device) { - cmr->deallocate(p, req.size, req.stream); - } else if (req.space == MemorySpace::Unified) { - mmr->deallocate(p, req.size, req.stream); - } else { - FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space); - } - - a.erase(it); - }; - - size_t getTempMemoryAvailable(int device) const - { - FAISS_ASSERT(isInitialized(device)); - - auto it = tempMemory_.find(device); - FAISS_ASSERT(it != tempMemory_.end()); - - return it->second->getSizeAvailable(); - }; - - /// Export a description of memory used for Python - std::map>> getMemoryInfo() const - { - using AT = std::map>; - - std::map out; - - for 
(auto& entry : allocs_) { - AT outDevice; - - for (auto& a : entry.second) { - auto& v = outDevice[allocTypeToString(a.second.type)]; - v.first++; - v.second += a.second.size; - } - - out[entry.first] = std::move(outDevice); - } - - return out; - }; - - std::pair getPinnedMemory() - { - return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_); - }; - - cudaStream_t getAsyncCopyStream(int device) - { - initializeForDevice(device); - return asyncCopyStreams_[device]; - }; - - private: - /// Have GPU resources been initialized for this device yet? - bool isInitialized(int device) const - { - // Use default streams as a marker for whether or not a certain - // device has been initialized - return defaultStreams_.count(device) != 0; - }; - - /// Adjust the default temporary memory allocation based on the total GPU - /// memory size - static size_t getDefaultTempMemForGPU(int device, size_t requested) - { - auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem - : std::numeric_limits::max(); - - if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) { - // If the GPU has <= 4 GiB of memory, reserve 512 MiB - - if (requested > k4GiBTempMem) { return k4GiBTempMem; } - } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) { - // If the GPU has <= 8 GiB of memory, reserve 1 GiB - - if (requested > k8GiBTempMem) { return k8GiBTempMem; } - } else { - // Never use more than 1.5 GiB - if (requested > kMaxTempMem) { return kMaxTempMem; } - } - - // use whatever lower limit the user requested - return requested; - }; - - private: - /// Set of currently outstanding memory allocations per device - /// device -> (alloc request, allocated ptr) - std::unordered_map> allocs_; - - /// Temporary memory provider, per each device - std::unordered_map> tempMemory_; - - /// Our default stream that work is ordered on, one per each device - std::unordered_map defaultStreams_; - - /// This contains particular streams as set by the user for - /// ordering, if any - 
std::unordered_map userDefaultStreams_; - - /// Other streams we can use, per each device - std::unordered_map> alternateStreams_; - - /// Async copy stream to use for GPU <-> CPU pinned memory copies - std::unordered_map asyncCopyStreams_; - - /// cuBLAS handle for each device - std::unordered_map blasHandles_; - - /// Pinned memory allocation for use with this GPU - void* pinnedMemAlloc_; - size_t pinnedMemAllocSize_; - - /// Another option is to use a specified amount of memory on all - /// devices - size_t tempMemSize_; - - /// Amount of pinned memory we should allocate - size_t pinnedMemSize_; - - /// Whether or not we log every GPU memory allocation and deallocation - bool allocLogging_; - - // cuda_memory_resource - std::unique_ptr cmr; - - // managed_memory_resource - std::unique_ptr mmr; - - // pinned_memory_resource - std::unique_ptr pmr; -}; - -/// Default implementation of GpuResources that allocates a cuBLAS -/// stream and 2 streams for use, as well as temporary memory. -/// Internally, the Faiss GPU code uses the instance managed by getResources, -/// but this is the user-facing object that is internally reference counted. -class RmmGpuResources : public GpuResourcesProvider { - public: - RmmGpuResources() : res_(new RmmGpuResourcesImpl){}; - - ~RmmGpuResources(){}; - - std::shared_ptr getResources() { return res_; }; - - /// Disable allocation of temporary memory; all temporary memory - /// requests will call cudaMalloc / cudaFree at the point of use - void noTempMemory() { res_->noTempMemory(); }; - - /// Specify that we wish to use a certain fixed size of memory on - /// all devices as temporary memory. This is the upper bound for the GPU - /// memory that we will reserve. We will never go above 1.5 GiB on any GPU; - /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that. - /// To avoid any temporary memory allocation, pass 0. 
- void setTempMemory(size_t size) { res_->setTempMemory(size); }; - - /// Set amount of pinned memory to allocate, for async GPU <-> CPU - /// transfers - void setPinnedMemory(size_t size) { res_->setPinnedMemory(size); }; - - /// Called to change the stream for work ordering. We do not own `stream`; - /// i.e., it will not be destroyed when the GpuResources object gets cleaned - /// up. - /// We are guaranteed that all Faiss GPU work is ordered with respect to - /// this stream upon exit from an index or other Faiss GPU call. - void setDefaultStream(int device, cudaStream_t stream) - { - res_->setDefaultStream(device, stream); - }; - - /// Revert the default stream to the original stream managed by this resources - /// object, in case someone called `setDefaultStream`. - void revertDefaultStream(int device) { res_->revertDefaultStream(device); }; - - /// Called to change the work ordering streams to the null stream - /// for all devices - void setDefaultNullStreamAllDevices() { res_->setDefaultNullStreamAllDevices(); }; - - /// Export a description of memory used for Python - std::map>> getMemoryInfo() const - { - return res_->getMemoryInfo(); - }; - - /// Returns the current default stream - cudaStream_t getDefaultStream(int device) { return res_->getDefaultStream(device); }; - - /// Returns the current amount of temp memory available - size_t getTempMemoryAvailable(int device) const { return res_->getTempMemoryAvailable(device); }; - - /// Synchronize our default stream with the CPU - void syncDefaultStreamCurrentDevice() { res_->syncDefaultStreamCurrentDevice(); }; - - /// If enabled, will print every GPU memory allocation and deallocation to - /// standard output - void setLogMemoryAllocations(bool enable) { res_->setLogMemoryAllocations(enable); }; - - private: - std::shared_ptr res_; -}; - -} // namespace knn -} // namespace spatial -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/spatial/knn/knn.cuh 
b/cpp/include/raft/spatial/knn/knn.cuh index 692d262043..a7bbfd9500 100644 --- a/cpp/include/raft/spatial/knn/knn.cuh +++ b/cpp/include/raft/spatial/knn/knn.cuh @@ -153,12 +153,12 @@ template case SelectKAlgo::RADIX_8_BITS: matrix::detail::select::radix::select_k( - in_keys, in_values, n_inputs, input_len, k, out_keys, out_values, select_min, stream); + in_keys, in_values, n_inputs, input_len, k, out_keys, out_values, select_min, true, stream); break; case SelectKAlgo::RADIX_11_BITS: matrix::detail::select::radix::select_k( - in_keys, in_values, n_inputs, input_len, k, out_keys, out_values, select_min, stream); + in_keys, in_values, n_inputs, input_len, k, out_keys, out_values, select_min, true, stream); break; case SelectKAlgo::WARP_SORT: diff --git a/cpp/include/raft/spatial/knn/specializations.cuh b/cpp/include/raft/spatial/knn/specializations.cuh index 34b7b742e9..5f0a39a61b 100644 --- a/cpp/include/raft/spatial/knn/specializations.cuh +++ b/cpp/include/raft/spatial/knn/specializations.cuh @@ -14,13 +14,8 @@ * limitations under the License. */ -#ifndef __KNN_SPECIALIZATIONS_H -#define __KNN_SPECIALIZATIONS_H - #pragma once #include +#include #include -#include - -#endif diff --git a/cpp/include/raft/spatial/knn/specializations/knn.cuh b/cpp/include/raft/spatial/knn/specializations/knn.cuh new file mode 100644 index 0000000000..e045487597 --- /dev/null +++ b/cpp/include/raft/spatial/knn/specializations/knn.cuh @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::spatial::knn { +#define RAFT_INST(IdxT, T, IntT) \ + extern template void brute_force_knn(raft::device_resources const& handle, \ + std::vector& input, \ + std::vector& sizes, \ + IntT D, \ + T* search_items, \ + IntT n, \ + IdxT* res_I, \ + T* res_D, \ + IntT k, \ + bool rowMajorIndex, \ + bool rowMajorQuery, \ + std::vector* translations, \ + distance::DistanceType metric, \ + float metric_arg); + +RAFT_INST(long, float, int); +RAFT_INST(long, float, unsigned int); +RAFT_INST(uint32_t, float, int); +RAFT_INST(uint32_t, float, unsigned int); +#undef RAFT_INST +}; // namespace raft::spatial::knn diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp index e32b718117..73518e20ef 100644 --- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp @@ -352,7 +352,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // scales y by beta: // if (beta == 0) { - CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); } else if (beta != 1) { // TODO: Call from public API when ready RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal(cublas_h, n, &beta, y, 1, stream)); diff --git a/cpp/include/raft/spectral/specializations.cuh b/cpp/include/raft/spectral/specializations.cuh index 2303b426fd..0ce5f0c653 100644 --- a/cpp/include/raft/spectral/specializations.cuh +++ b/cpp/include/raft/spectral/specializations.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,6 @@ #pragma once #include -#include +#include #endif \ No newline at end of file diff --git a/cpp/include/raft/stats/specializations.cuh b/cpp/include/raft/stats/specializations.cuh index 660eee783f..e6622469d3 100644 --- a/cpp/include/raft/stats/specializations.cuh +++ b/cpp/include/raft/stats/specializations.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,6 @@ #pragma once #include -#include +#include #endif \ No newline at end of file diff --git a/cpp/include/raft/util/arch.cuh b/cpp/include/raft/util/arch.cuh index 8c48b87269..dc35b10063 100644 --- a/cpp/include/raft/util/arch.cuh +++ b/cpp/include/raft/util/arch.cuh @@ -15,25 +15,27 @@ */ #pragma once -namespace raft::arch { +#include // RAFT_CUDA_TRY -/* raft::arch provides the following facilities: +namespace raft::util::arch { + +/* raft::util::arch provides the following facilities: * - * - raft::arch::SM_XX : hardcoded compile-time constants for various compute - * architectures. The values raft::arch::SM_min and raft::arch::SM_future + * - raft::util::arch::SM_XX : hardcoded compile-time constants for various compute + * architectures. The values raft::util::arch::SM_min and raft::util::arch::SM_future * represent architectures that are always smaller and larger (respectively) * than any architecture that can be encountered in practice. * - * - raft::arch::SM_compute_arch : a compile-time value for the *current* + * - raft::util::arch::SM_compute_arch : a compile-time value for the *current* * compute architecture that a kernel is compiled with. It can only be used * inside kernels with a template argument. 
* - * - raft::arch::kernel_runtime_arch : a function that computes at *run-time* + * - raft::util::arch::kernel_runtime_arch : a function that computes at *run-time* * which version of a kernel will launch (i.e., it will return the compute * architecture of the version of the kernel that will be launched by the * driver). * - * - raft::arch::SM_range : a compile-time value to represent an open interval + * - raft::util::arch::SM_range : a compile-time value to represent an open interval * of compute architectures. This can be used to check if the current * compile-time architecture is in a specified compatibility range. */ @@ -46,9 +48,6 @@ struct SM_generic { public: __host__ __device__ constexpr int value() const { return n; } }; - -// A dummy kernel that is used to determine the runtime architecture. -__global__ inline void dummy_runtime_kernel() {} } // namespace detail // A list of architectures that RAPIDS explicitly builds for (SM60, ..., SM90) @@ -119,7 +118,7 @@ struct SM_runtime { inline SM_runtime kernel_runtime_arch(void* kernel) { cudaFuncAttributes attributes; - cudaFuncGetAttributes(&attributes, kernel); + RAFT_CUDA_TRY(cudaFuncGetAttributes(&attributes, kernel)); return SM_runtime(10 * attributes.ptxVersion); } @@ -143,4 +142,4 @@ struct SM_range { } }; -} // namespace raft::arch +} // namespace raft::util::arch diff --git a/cpp/include/raft/util/cuda_dev_essentials.cuh b/cpp/include/raft/util/cuda_dev_essentials.cuh new file mode 100644 index 0000000000..bb9ebbba59 --- /dev/null +++ b/cpp/include/raft/util/cuda_dev_essentials.cuh @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +// This file provides a few essential functions for use in __device__ code. The +// scope is necessarily limited to ensure that compilation times are minimized. +// Please make sure not to include large / expensive files from here. + +namespace raft { + +/** helper macro for device inlined functions */ +#define DI inline __device__ +#define HDI inline __host__ __device__ +#define HD __host__ __device__ + +/** + * @brief Provide a ceiling division operation ie. ceil(a / b) + * @tparam IntType supposed to be only integers for now! + */ +template +constexpr HDI IntType ceildiv(IntType a, IntType b) +{ + return (a + b - 1) / b; +} + +/** + * @brief Provide an alignment function ie. ceil(a / b) * b + * @tparam IntType supposed to be only integers for now! + */ +template +constexpr HDI IntType alignTo(IntType a, IntType b) +{ + return ceildiv(a, b) * b; +} + +/** + * @brief Provide an alignment function ie. (a / b) * b + * @tparam IntType supposed to be only integers for now! + */ +template +constexpr HDI IntType alignDown(IntType a, IntType b) +{ + return (a / b) * b; +} + +/** + * @brief Check if the input is a power of 2 + * @tparam IntType data type (checked only for integers) + */ +template +constexpr HDI bool isPo2(IntType num) +{ + return (num && !(num & (num - 1))); +} + +/** + * @brief Give logarithm of the number to base-2 + * @tparam IntType data type (checked only for integers) + */ +template +constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) +{ + return num <= IntType(1) ? 
ret : log2(num >> IntType(1), ++ret); +} + +/** number of threads per warp */ +static const int WarpSize = 32; + +/** get the laneId of the current thread */ +DI int laneId() +{ + int id; + asm("mov.s32 %0, %%laneid;" : "=r"(id)); + return id; +} + +/** Device function to apply the input lambda across threads in the grid */ +template +DI void forEach(int num, L lambda) +{ + int idx = (blockDim.x * blockIdx.x) + threadIdx.x; + const int numThreads = blockDim.x * gridDim.x; +#pragma unroll + for (int itr = 0; itr < ItemsPerThread; ++itr, idx += numThreads) { + if (idx < num) lambda(idx, itr); + } +} + +/** + * @brief Swap two values + * @tparam T the datatype of the values + * @param a first input + * @param b second input + */ +template +HDI void swapVals(T& a, T& b) +{ + T tmp = a; + a = b; + b = tmp; +} + +} // namespace raft diff --git a/cpp/include/raft/util/cuda_rt_essentials.hpp b/cpp/include/raft/util/cuda_rt_essentials.hpp new file mode 100644 index 0000000000..e5f3af4e61 --- /dev/null +++ b/cpp/include/raft/util/cuda_rt_essentials.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +// This file provides a few essential functions that wrap the CUDA runtime API. +// The scope is necessarily limited to ensure that compilation times are +// minimized. Please make sure not to include large / expensive files from here. 
+ +#include +#include + +namespace raft { + +/** + * @brief Exception thrown when a CUDA error is encountered. + */ +struct cuda_error : public raft::exception { + explicit cuda_error(char const* const message) : raft::exception(message) {} + explicit cuda_error(std::string const& message) : raft::exception(message) {} +}; + +} // namespace raft + +/** + * @brief Error checking macro for CUDA runtime API functions. + * + * Invokes a CUDA runtime API function call, if the call does not return + * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an + * exception detailing the CUDA error that occurred + * + */ +#define RAFT_CUDA_TRY(call) \ + do { \ + cudaError_t const status = call; \ + if (status != cudaSuccess) { \ + cudaGetLastError(); \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "CUDA error encountered at: ", \ + "call='%s', Reason=%s:%s", \ + #call, \ + cudaGetErrorName(status), \ + cudaGetErrorString(status)); \ + throw raft::cuda_error(msg); \ + } \ + } while (0) diff --git a/cpp/include/raft/util/cuda_utils.cuh b/cpp/include/raft/util/cuda_utils.cuh index 5be9dc999a..687a6b4651 100644 --- a/cpp/include/raft/util/cuda_utils.cuh +++ b/cpp/include/raft/util/cuda_utils.cuh @@ -23,113 +23,10 @@ #include #include #include - -#ifndef ENABLE_MEMCPY_ASYNC -// enable memcpy_async interface by default for newer GPUs -#if __CUDA_ARCH__ >= 800 -#define ENABLE_MEMCPY_ASYNC 1 -#endif -#else // ENABLE_MEMCPY_ASYNC -// disable memcpy_async for all older GPUs -#if __CUDA_ARCH__ < 800 -#define ENABLE_MEMCPY_ASYNC 0 -#endif -#endif // ENABLE_MEMCPY_ASYNC +#include namespace raft { -/** helper macro for device inlined functions */ -#define DI inline __device__ -#define HDI inline __host__ __device__ -#define HD __host__ __device__ - -/** - * @brief Provide a ceiling division operation ie. ceil(a / b) - * @tparam IntType supposed to be only integers for now! 
- */ -template -constexpr HDI IntType ceildiv(IntType a, IntType b) -{ - return (a + b - 1) / b; -} - -/** - * @brief Provide an alignment function ie. ceil(a / b) * b - * @tparam IntType supposed to be only integers for now! - */ -template -constexpr HDI IntType alignTo(IntType a, IntType b) -{ - return ceildiv(a, b) * b; -} - -/** - * @brief Provide an alignment function ie. (a / b) * b - * @tparam IntType supposed to be only integers for now! - */ -template -constexpr HDI IntType alignDown(IntType a, IntType b) -{ - return (a / b) * b; -} - -/** - * @brief Check if the input is a power of 2 - * @tparam IntType data type (checked only for integers) - */ -template -constexpr HDI bool isPo2(IntType num) -{ - return (num && !(num & (num - 1))); -} - -/** - * @brief Give logarithm of the number to base-2 - * @tparam IntType data type (checked only for integers) - */ -template -constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) -{ - return num <= IntType(1) ? ret : log2(num >> IntType(1), ++ret); -} - -/** Device function to apply the input lambda across threads in the grid */ -template -DI void forEach(int num, L lambda) -{ - int idx = (blockDim.x * blockIdx.x) + threadIdx.x; - const int numThreads = blockDim.x * gridDim.x; -#pragma unroll - for (int itr = 0; itr < ItemsPerThread; ++itr, idx += numThreads) { - if (idx < num) lambda(idx, itr); - } -} - -/** number of threads per warp */ -static const int WarpSize = 32; - -/** get the laneId of the current thread */ -DI int laneId() -{ - int id; - asm("mov.s32 %0, %%laneid;" : "=r"(id)); - return id; -} - -/** - * @brief Swap two values - * @tparam T the datatype of the values - * @param a first input - * @param b second input - */ -template -HDI void swapVals(T& a, T& b) -{ - T tmp = a; - a = b; - b = tmp; -} - /** Device function to have atomic add support for older archs */ template DI void myAtomicAdd(Type* address, Type val) diff --git a/cpp/include/raft/util/cudart_utils.hpp 
b/cpp/include/raft/util/cudart_utils.hpp index 0feb188ad8..1134513587 100644 --- a/cpp/include/raft/util/cudart_utils.hpp +++ b/cpp/include/raft/util/cudart_utils.hpp @@ -14,24 +14,17 @@ * limitations under the License. */ -/** - * This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/cudart_utils.hpp instead. - */ - -#ifndef __RAFT_RT_CUDART_UTILS_H -#define __RAFT_RT_CUDART_UTILS_H - #pragma once #include +#include #include #include #include #include #include -#include +#include #include #include @@ -40,47 +33,7 @@ #include #include #include - -namespace raft { - -/** - * @brief Exception thrown when a CUDA error is encountered. - */ -struct cuda_error : public raft::exception { - explicit cuda_error(char const* const message) : raft::exception(message) {} - explicit cuda_error(std::string const& message) : raft::exception(message) {} -}; - -} // namespace raft - -/** - * @brief Error checking macro for CUDA runtime API functions. - * - * Invokes a CUDA runtime API function call, if the call does not return - * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an - * exception detailing the CUDA error that occurred - * - */ -#define RAFT_CUDA_TRY(call) \ - do { \ - cudaError_t const status = call; \ - if (status != cudaSuccess) { \ - cudaGetLastError(); \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "CUDA error encountered at: ", \ - "call='%s', Reason=%s:%s", \ - #call, \ - cudaGetErrorName(status), \ - cudaGetErrorString(status)); \ - throw raft::cuda_error(msg); \ - } \ - } while (0) - -// FIXME: Remove after consumers rename -#ifndef CUDA_TRY -#define CUDA_TRY(call) RAFT_CUDA_TRY(call) -#endif +#include /** * @brief Debug macro to check for CUDA errors @@ -101,16 +54,6 @@ struct cuda_error : public raft::exception { #define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaPeekAtLastError()); #endif -// FIXME: Remove after consumers rename -#ifndef CHECK_CUDA -#define CHECK_CUDA(call) RAFT_CHECK_CUDA(call) 
-#endif - -/** FIXME: remove after cuml rename */ -#ifndef CUDA_CHECK -#define CUDA_CHECK(call) RAFT_CUDA_TRY(call) -#endif - // /** // * @brief check for cuda runtime API errors but log error instead of raising // * exception. @@ -127,17 +70,6 @@ struct cuda_error : public raft::exception { } \ } while (0) -// FIXME: Remove after cuml rename -#ifndef CUDA_CHECK_NO_THROW -#define CUDA_CHECK_NO_THROW(call) RAFT_CUDA_TRY_NO_THROW(call) -#endif - -/** - * Alias to raft scope for now. - * TODO: Rename original implementations in 22.04 to fix - * https://github.com/rapidsai/raft/issues/128 - */ - namespace raft { /** Helper method to get to know warp size in device code */ @@ -249,7 +181,7 @@ class grid_1d_block_t { template void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) { - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); } /** @@ -275,7 +207,8 @@ void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_vi template void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) { - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); + RAFT_CUDA_TRY( + cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); } /** @} */ @@ -304,7 +237,7 @@ void print_device_vector(const char* variable_name, OutStream& out) { auto host_mem = std::make_unique(componentsCount); - CUDA_CHECK( + RAFT_CUDA_TRY( cudaMemcpy(host_mem.get(), devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); print_host_vector(variable_name, host_mem.get(), componentsCount, out); } @@ -566,5 +499,3 @@ inline auto get_pool_memory_resource(rmm::mr::device_memory_resource*& mr, size_ } } // namespace raft - -#endif diff --git a/cpp/include/raft/util/device_loads_stores.cuh 
b/cpp/include/raft/util/device_loads_stores.cuh index 2b87c44d60..c9bda26b81 100644 --- a/cpp/include/raft/util/device_loads_stores.cuh +++ b/cpp/include/raft/util/device_loads_stores.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,8 @@ #pragma once -#include +#include // uintX_t +#include // DI namespace raft { diff --git a/cpp/include/raft_runtime/neighbors/brute_force.hpp b/cpp/include/raft_runtime/neighbors/brute_force.hpp new file mode 100644 index 0000000000..12da6ff101 --- /dev/null +++ b/cpp/include/raft_runtime/neighbors/brute_force.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace raft::runtime::neighbors::brute_force { + +#define RAFT_INST_BFKNN(IDX_T, DATA_T, MATRIX_IDX_T, INDEX_LAYOUT, SEARCH_LAYOUT) \ + void knn(raft::device_resources const& handle, \ + raft::device_matrix_view index, \ + raft::device_matrix_view search, \ + raft::device_matrix_view indices, \ + raft::device_matrix_view distances, \ + distance::DistanceType metric = distance::DistanceType::L2Unexpanded, \ + std::optional metric_arg = std::make_optional(2.0f), \ + std::optional global_id_offset = std::nullopt); + +RAFT_INST_BFKNN(int64_t, float, int64_t, raft::row_major, raft::row_major); + +#undef RAFT_INST_BFKNN + +} // namespace raft::runtime::neighbors::brute_force diff --git a/cpp/internal/raft_internal/matrix/select_k.cuh b/cpp/internal/raft_internal/matrix/select_k.cuh index 59cbff9dfb..188122c9b4 100644 --- a/cpp/internal/raft_internal/matrix/select_k.cuh +++ b/cpp/internal/raft_internal/matrix/select_k.cuh @@ -20,7 +20,7 @@ #include #include -#ifdef RAFT_DISTANCE_COMPILED +#ifdef RAFT_COMPILED #include #endif @@ -33,7 +33,8 @@ struct params { size_t len; int k; bool select_min; - bool use_index_input = true; + bool use_index_input = true; + bool use_same_leading_bits = false; }; inline auto operator<<(std::ostream& os, const params& ss) -> std::ostream& @@ -42,7 +43,8 @@ inline auto operator<<(std::ostream& os, const params& ss) -> std::ostream& os << ", len: " << ss.len; os << ", k: " << ss.k; os << (ss.select_min ? ", asc" : ", dsc"); - os << (ss.use_index_input ? "}" : ", no-input-index}"); + os << (ss.use_index_input ? "" : ", no-input-index"); + os << (ss.use_same_leading_bits ? 
", same-leading-bits}" : "}"); return os; } @@ -50,6 +52,7 @@ enum class Algo { kPublicApi, kRadix8bits, kRadix11bits, + kRadix11bitsExtraPass, kWarpAuto, kWarpImmediate, kWarpFiltered, @@ -63,6 +66,7 @@ inline auto operator<<(std::ostream& os, const Algo& algo) -> std::ostream& case Algo::kPublicApi: return os << "kPublicApi"; case Algo::kRadix8bits: return os << "kRadix8bits"; case Algo::kRadix11bits: return os << "kRadix11bits"; + case Algo::kRadix11bitsExtraPass: return os << "kRadix11bitsExtraPass"; case Algo::kWarpAuto: return os << "kWarpAuto"; case Algo::kWarpImmediate: return os << "kWarpImmediate"; case Algo::kWarpFiltered: return os << "kWarpFiltered"; @@ -103,11 +107,38 @@ void select_k_impl(const device_resources& handle, } } case Algo::kRadix8bits: - return detail::select::radix::select_k( - in, in_idx, batch_size, len, k, out, out_idx, select_min, stream); + return detail::select::radix::select_k(in, + in_idx, + batch_size, + len, + k, + out, + out_idx, + select_min, + true, // fused_last_filter + stream); case Algo::kRadix11bits: - return detail::select::radix::select_k( - in, in_idx, batch_size, len, k, out, out_idx, select_min, stream); + return detail::select::radix::select_k(in, + in_idx, + batch_size, + len, + k, + out, + out_idx, + select_min, + true, // fused_last_filter + stream); + case Algo::kRadix11bitsExtraPass: + return detail::select::radix::select_k(in, + in_idx, + batch_size, + len, + k, + out, + out_idx, + select_min, + false, // fused_last_filter + stream); case Algo::kWarpAuto: return detail::select::warpsort::select_k( in, in_idx, batch_size, len, k, out, out_idx, select_min, stream); diff --git a/cpp/internal/raft_internal/neighbors/naive_knn.cuh b/cpp/internal/raft_internal/neighbors/naive_knn.cuh index 942c096e58..47d6f068e3 100644 --- a/cpp/internal/raft_internal/neighbors/naive_knn.cuh +++ b/cpp/internal/raft_internal/neighbors/naive_knn.cuh @@ -21,7 +21,7 @@ #include #include -#if defined RAFT_DISTANCE_COMPILED +#if defined 
RAFT_COMPILED #include #endif diff --git a/cpp/scripts/analyze_nvcc_log.py b/cpp/scripts/analyze_nvcc_log.py new file mode 100755 index 0000000000..d06e05d265 --- /dev/null +++ b/cpp/scripts/analyze_nvcc_log.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from pathlib import Path +from matplotlib import colors + +def main(input_path): + input_path = Path(input_path) + print("-- loading data") + df = pd.read_csv(input_path) + + print("-- analyzing data") + # Strip spaces from column names + df = df.rename(columns=str.strip) + df["seconds"] = df["metric"] / 1000 + df["file"] = df["source file name"] + df["phase"] = df["phase name"].str.strip() + + dfp = (df + # Remove nvcc driver entries. They don't contain a source file name + .query("phase!='nvcc (driver)'") + # Make a pivot table containing files as row, phase (preprocessing, + # cicc, etc.) as column and the total times as table entries. NOTE: + # if compiled for multiple archs, the archs will be summed. 
+ .pivot_table(index="file", values="seconds", columns="phase", aggfunc='sum')) + + dfp_sum = dfp.sum(axis="columns") + + df_fraction = dfp.divide(dfp_sum, axis="index") + df_fraction["total time"] = dfp_sum + df_fraction = df_fraction.melt(ignore_index=False, id_vars="total time", var_name="phase", value_name="fraction") + + dfp["total time"] = dfp_sum + df_absolute = dfp.melt(ignore_index=False, id_vars="total time", var_name="phase", value_name="seconds") + + # host: light red to dark red (preprocessing, cudafe, gcc (compiling)) + # device: light green to dark green (preprocessing, cicc, ptxas) + palette = { + "gcc (preprocessing 4)": colors.hsv_to_rgb((0, 1, 1)), + 'cudafe++': colors.hsv_to_rgb((0, 1, .75)), + 'gcc (compiling)': colors.hsv_to_rgb((0, 1, .4)), + "gcc (preprocessing 1)": colors.hsv_to_rgb((.33, 1, 1)), + 'cicc': colors.hsv_to_rgb((.33, 1, 0.75)), + 'ptxas': colors.hsv_to_rgb((.33, 1, 0.4)), + 'fatbinary': "grey", + } + + print("-- Ten longest translation units:") + colwidth = pd.get_option('display.max_colwidth') - 1 + dfp = dfp.reset_index() + dfp["file"] = dfp["file"].apply(lambda s: s[-colwidth:]) + print(dfp.sort_values("total time", ascending=False).reset_index().loc[:10]) + + print("-- Plotting absolute compile times") + abs_out_path = f"{input_path}.absolute.compile_times.png" + sns.displot( + df_absolute.sort_values("total time").reset_index(), + y="file", + hue="phase", + hue_order=reversed( + ["gcc (preprocessing 4)", 'cudafe++', 'gcc (compiling)', + "gcc (preprocessing 1)", 'cicc', 'ptxas', + 'fatbinary', + ]), + palette=palette, + weights="seconds", + multiple="stack", + kind="hist", + height=20, + ) + plt.xlabel("seconds"); + plt.savefig(abs_out_path) + print(f"-- Wrote absolute compile time plot to {abs_out_path}") + + print("-- Plotting relative compile times") + rel_out_path = f"{input_path}.relative.compile_times.png" + sns.displot( + df_fraction.sort_values('total time').reset_index(), + y="file", + hue="phase", + 
hue_order=reversed(["gcc (preprocessing 4)", 'cudafe++', 'gcc (compiling)', + "gcc (preprocessing 1)", 'cicc', 'ptxas', + 'fatbinary', + ]), + palette=palette, + weights="fraction", + multiple="stack", + kind="hist", + height=15, + ) + plt.xlabel("fraction"); + plt.savefig(rel_out_path) + print(f"-- Wrote relative compile time plot to {rel_out_path}") + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("""NVCC log analyzer + + Analyzes nvcc logs and outputs a figure with highest ranking translation + units. + + Usage: + python analyze_nvcc_log.py + cpp/scripts/analyze_nvcc_log.py + + Generate the nvcc log file by adding: + + list(APPEND RAFT_CUDA_FLAGS "--time=CMakeFiles/nvcc_compile_log.csv") + + to cpp/cmake/modules/ConfigureCUDA.cmake. + """) + + input_path = Path(sys.argv[1]) + if not input_path.exists(): + print(f"Path {input_path} does not exist.") + else: + main(input_path) diff --git a/cpp/src/distance/cluster/cluster_cost.cuh b/cpp/src/cluster/cluster_cost.cuh similarity index 100% rename from cpp/src/distance/cluster/cluster_cost.cuh rename to cpp/src/cluster/cluster_cost.cuh diff --git a/cpp/src/distance/cluster/cluster_cost_double.cu b/cpp/src/cluster/cluster_cost_double.cu similarity index 100% rename from cpp/src/distance/cluster/cluster_cost_double.cu rename to cpp/src/cluster/cluster_cost_double.cu diff --git a/cpp/src/distance/cluster/cluster_cost_float.cu b/cpp/src/cluster/cluster_cost_float.cu similarity index 100% rename from cpp/src/distance/cluster/cluster_cost_float.cu rename to cpp/src/cluster/cluster_cost_float.cu diff --git a/cpp/src/distance/cluster/kmeans_fit_double.cu b/cpp/src/cluster/kmeans_fit_double.cu similarity index 100% rename from cpp/src/distance/cluster/kmeans_fit_double.cu rename to cpp/src/cluster/kmeans_fit_double.cu diff --git a/cpp/src/distance/cluster/kmeans_fit_float.cu b/cpp/src/cluster/kmeans_fit_float.cu similarity index 100% rename from cpp/src/distance/cluster/kmeans_fit_float.cu rename to 
cpp/src/cluster/kmeans_fit_float.cu diff --git a/cpp/src/distance/cluster/kmeans_init_plus_plus_double.cu b/cpp/src/cluster/kmeans_init_plus_plus_double.cu similarity index 100% rename from cpp/src/distance/cluster/kmeans_init_plus_plus_double.cu rename to cpp/src/cluster/kmeans_init_plus_plus_double.cu diff --git a/cpp/src/distance/cluster/kmeans_init_plus_plus_float.cu b/cpp/src/cluster/kmeans_init_plus_plus_float.cu similarity index 100% rename from cpp/src/distance/cluster/kmeans_init_plus_plus_float.cu rename to cpp/src/cluster/kmeans_init_plus_plus_float.cu diff --git a/cpp/src/distance/cluster/update_centroids.cuh b/cpp/src/cluster/update_centroids.cuh similarity index 100% rename from cpp/src/distance/cluster/update_centroids.cuh rename to cpp/src/cluster/update_centroids.cuh diff --git a/cpp/src/distance/cluster/update_centroids_double.cu b/cpp/src/cluster/update_centroids_double.cu similarity index 100% rename from cpp/src/distance/cluster/update_centroids_double.cu rename to cpp/src/cluster/update_centroids_double.cu diff --git a/cpp/src/distance/cluster/update_centroids_float.cu b/cpp/src/cluster/update_centroids_float.cu similarity index 100% rename from cpp/src/distance/cluster/update_centroids_float.cu rename to cpp/src/cluster/update_centroids_float.cu diff --git a/cpp/src/distance/distance/specializations/detail/canberra_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/canberra_double_double_double_int.cu deleted file mode 100644 index 4e9e608792..0000000000 --- a/cpp/src/distance/distance/specializations/detail/canberra_double_double_double_int.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/correlation_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/correlation_double_double_double_int.cu deleted file mode 100644 index 2df77a4b5d..0000000000 --- a/cpp/src/distance/distance/specializations/detail/correlation_double_double_double_int.cu +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { - -template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - double metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/correlation_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/correlation_float_float_float_int.cu deleted file mode 100644 index 76ed00afa6..0000000000 --- a/cpp/src/distance/distance/specializations/detail/correlation_float_float_float_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - float metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/cosine_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/cosine_double_double_double_int.cu deleted file mode 100644 index 3e0bcb92ed..0000000000 --- a/cpp/src/distance/distance/specializations/detail/cosine_double_double_double_int.cu +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { - -template void distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - double metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/cosine_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/cosine_float_float_float_int.cu deleted file mode 100644 index 23131ce2c7..0000000000 --- a/cpp/src/distance/distance/specializations/detail/cosine_float_float_float_int.cu +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { - -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - float metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu deleted file mode 100644 index b618fd024c..0000000000 --- a/cpp/src/distance/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { - -template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - double metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu deleted file mode 100644 index 18e7aad9e9..0000000000 --- a/cpp/src/distance/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - float metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu deleted file mode 100644 index 08ab20cfe5..0000000000 --- a/cpp/src/distance/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { - -template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - double metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu deleted file mode 100644 index 79eed075fb..0000000000 --- a/cpp/src/distance/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { - -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - float metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/jensen_shannon_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/jensen_shannon_double_double_double_int.cu deleted file mode 100644 index ed84ee6dc4..0000000000 --- a/cpp/src/distance/distance/specializations/detail/jensen_shannon_double_double_double_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - double metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/jensen_shannon_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/jensen_shannon_float_float_float_int.cu deleted file mode 100644 index a241af767c..0000000000 --- a/cpp/src/distance/distance/specializations/detail/jensen_shannon_float_float_float_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - float metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/kl_divergence_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/kl_divergence_double_double_double_int.cu deleted file mode 100644 index c4c944d123..0000000000 --- a/cpp/src/distance/distance/specializations/detail/kl_divergence_double_double_double_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - double metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/kl_divergence_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/kl_divergence_float_float_float_int.cu deleted file mode 100644 index aa1db5a837..0000000000 --- a/cpp/src/distance/distance/specializations/detail/kl_divergence_float_float_float_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - float metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/l1_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/l1_double_double_double_int.cu deleted file mode 100644 index 391a1c2aa4..0000000000 --- a/cpp/src/distance/distance/specializations/detail/l1_double_double_double_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - double metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/l2_expanded_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/l2_expanded_double_double_double_int.cu deleted file mode 100644 index 8c5f746fa2..0000000000 --- a/cpp/src/distance/distance/specializations/detail/l2_expanded_double_double_double_int.cu +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - double metric_arg); -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/l2_expanded_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/l2_expanded_float_float_float_int.cu deleted file mode 100644 index c266125f98..0000000000 --- a/cpp/src/distance/distance/specializations/detail/l2_expanded_float_float_float_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - float metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/l2_sqrt_expanded_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/l2_sqrt_expanded_double_double_double_int.cu deleted file mode 100644 index 399b120527..0000000000 --- a/cpp/src/distance/distance/specializations/detail/l2_sqrt_expanded_double_double_double_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - double metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/l2_sqrt_expanded_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/l2_sqrt_expanded_float_float_float_int.cu deleted file mode 100644 index 66de212b8e..0000000000 --- a/cpp/src/distance/distance/specializations/detail/l2_sqrt_expanded_float_float_float_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - float metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu deleted file mode 100644 index 562d93b2de..0000000000 --- a/cpp/src/distance/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { - -template void distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - double metric_arg); -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_int.cu deleted file mode 100644 index 386bbafc5f..0000000000 --- a/cpp/src/distance/distance/specializations/detail/l2_sqrt_unexpanded_float_float_float_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - float metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu deleted file mode 100644 index 7733c3af48..0000000000 --- a/cpp/src/distance/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - double metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu deleted file mode 100644 index 4ea18d31de..0000000000 --- a/cpp/src/distance/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - float metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/l_inf_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/l_inf_double_double_double_int.cu deleted file mode 100644 index 74414f8fd6..0000000000 --- a/cpp/src/distance/distance/specializations/detail/l_inf_double_double_double_int.cu +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { - -template void distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - double metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu deleted file mode 100644 index 402cb51b7e..0000000000 --- a/cpp/src/distance/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { - -template void distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - double metric_arg); -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu deleted file mode 100644 index 7efe2b3349..0000000000 --- a/cpp/src/distance/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - float metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/russel_rao_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/russel_rao_double_double_double_int.cu deleted file mode 100644 index b1e6f5e1f4..0000000000 --- a/cpp/src/distance/distance/specializations/detail/russel_rao_double_double_double_int.cu +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - double metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/specializations/detail/russel_rao_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/russel_rao_float_float_float_int.cu deleted file mode 100644 index 1e12bcd705..0000000000 --- a/cpp/src/distance/distance/specializations/detail/russel_rao_float_float_float_int.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace raft { -namespace distance { -namespace detail { -template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - std::size_t worksize, - bool isRowMajor, - float metric_arg); - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/src/distance/distance/fused_l2_min_arg.cu b/cpp/src/distance/fused_l2_min_arg.cu similarity index 100% rename from cpp/src/distance/distance/fused_l2_min_arg.cu rename to cpp/src/distance/fused_l2_min_arg.cu diff --git a/cpp/src/distance/neighbors/specializations/ivfflat_search_float_int64_t.cu b/cpp/src/distance/neighbors/specializations/ivfflat_search_float_int64_t.cu deleted file mode 100644 index 6de65546c8..0000000000 --- a/cpp/src/distance/neighbors/specializations/ivfflat_search_float_int64_t.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -namespace raft::neighbors::ivf_flat { - -#define RAFT_MAKE_INSTANCE(T, IdxT) \ - template void search(raft::device_resources const&, \ - raft::neighbors::ivf_flat::search_params const&, \ - const raft::neighbors::ivf_flat::index&, \ - raft::device_matrix_view, \ - raft::device_matrix_view, \ - raft::device_matrix_view); - -RAFT_MAKE_INSTANCE(float, int64_t); - -#undef RAFT_MAKE_INSTANCE - -} // namespace raft::neighbors::ivf_flat diff --git a/cpp/src/distance/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu b/cpp/src/distance/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu deleted file mode 100644 index 8eda240ccd..0000000000 --- a/cpp/src/distance/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -namespace raft::neighbors::ivf_flat { - -#define RAFT_MAKE_INSTANCE(T, IdxT) \ - template void search(raft::device_resources const&, \ - raft::neighbors::ivf_flat::search_params const&, \ - const raft::neighbors::ivf_flat::index&, \ - raft::device_matrix_view, \ - raft::device_matrix_view, \ - raft::device_matrix_view); - -RAFT_MAKE_INSTANCE(int8_t, int64_t); - -#undef RAFT_MAKE_INSTANCE - -} // namespace raft::neighbors::ivf_flat diff --git a/cpp/src/distance/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu b/cpp/src/distance/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu deleted file mode 100644 index 8ff6533628..0000000000 --- a/cpp/src/distance/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -namespace raft::neighbors::ivf_flat { - -#define RAFT_MAKE_INSTANCE(T, IdxT) \ - template void search(raft::device_resources const&, \ - raft::neighbors::ivf_flat::search_params const&, \ - const raft::neighbors::ivf_flat::index&, \ - raft::device_matrix_view, \ - raft::device_matrix_view, \ - raft::device_matrix_view); - -RAFT_MAKE_INSTANCE(uint8_t, int64_t); - -#undef RAFT_MAKE_INSTANCE - -} // namespace raft::neighbors::ivf_flat diff --git a/cpp/src/distance/distance/pairwise_distance.cu b/cpp/src/distance/pairwise_distance.cu similarity index 100% rename from cpp/src/distance/distance/pairwise_distance.cu rename to cpp/src/distance/pairwise_distance.cu diff --git a/cpp/src/distance/specializations/detail/00_write_template.py b/cpp/src/distance/specializations/detail/00_write_template.py new file mode 100644 index 0000000000..3f2f853569 --- /dev/null +++ b/cpp/src/distance/specializations/detail/00_write_template.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 + +# NOTE: this template is not perfectly formatted. Use pre-commit to get +# everything in shape again. +template = """/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +INCLUDE_SM_HEADERS + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point( + OpT, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail +""" + +data_type_instances = [ + dict( + DataT="float", + AccT="float", + OutT="float", + IdxT="int", + ), + dict( + DataT="double", + AccT="double", + OutT="double", + IdxT="int", + ), +] + +op_instances = [ + dict( + path_prefix="canberra", + OpT="ops::canberra_distance_op", + archs = [60], + ), + dict( + path_prefix="correlation", + OpT="ops::correlation_distance_op", + archs = [60], + ), + dict( + path_prefix="cosine", + OpT="ops::cosine_distance_op", + archs = [60, 80], + ), + dict( + path_prefix="hamming_unexpanded", + OpT="ops::hamming_distance_op", + archs = [60], + ), + dict( + path_prefix="hellinger_expanded", + OpT="ops::hellinger_distance_op", + archs = [60], + ), + # inner product is handled by cublas. 
+ dict( + path_prefix="jensen_shannon", + OpT="ops::jensen_shannon_distance_op", + archs = [60], + ), + dict( + path_prefix="kl_divergence", + OpT="ops::kl_divergence_op", + archs = [60], + ), + dict( + path_prefix="l1", + OpT="ops::l1_distance_op", + archs = [60], + ), + dict( + path_prefix="l2_expanded", + OpT="ops::l2_exp_distance_op", + archs = [60, 80], + ), + dict( + path_prefix="l2_unexpanded", + OpT="ops::l2_unexp_distance_op", + archs = [60], + ), + dict( + path_prefix="l_inf", + OpT="ops::l_inf_distance_op", + archs = [60], + ), + dict( + path_prefix="lp_unexpanded", + OpT="ops::lp_unexp_distance_op", + archs = [60], + ), + dict( + path_prefix="russel_rao", + OpT="ops::russel_rao_distance_op", + archs = [60], + ), +] + +def fill_in(s, template): + for k, v in template.items(): + s = s.replace(k, v) + return s + +def fill_include_sm_headers(op_instance): + include_headers ="\n".join([ + f"#include " + for arch in op_instance["archs"] + ]) + + return { + "path_prefix": op_instance["path_prefix"], + "OpT": op_instance["OpT"], + "INCLUDE_SM_HEADERS": include_headers + } + +for op_instance in op_instances: + op_instance = fill_include_sm_headers(op_instance) + + for data_type_instance in data_type_instances: + op_data_instance = { + k : fill_in(v, data_type_instance) + for k, v in op_instance.items() + } + instance = { + **op_data_instance, + **data_type_instance, + "FinopT": "decltype(raft::identity_op())", + } + + text = fill_in(template, instance) + + path = fill_in("path_prefix_DataT_AccT_OutT_IdxT.cu", instance) + with open(path, "w") as f: + f.write(text) diff --git a/cpp/src/distance/specializations/detail/canberra_double_double_double_int.cu b/cpp/src/distance/specializations/detail/canberra_double_double_double_int.cu new file mode 100644 index 0000000000..037d218178 --- /dev/null +++ b/cpp/src/distance/specializations/detail/canberra_double_double_double_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + double, + double, + decltype(raft::identity_op())>( + ops::canberra_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/canberra_float_float_float_int.cu b/cpp/src/distance/specializations/detail/canberra_float_float_float_int.cu new file mode 100644 index 0000000000..0ed8ea7bb0 --- /dev/null +++ b/cpp/src/distance/specializations/detail/canberra_float_float_float_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + float, + float, + decltype(raft::identity_op())>( + ops::canberra_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/correlation_double_double_double_int.cu b/cpp/src/distance/specializations/detail/correlation_double_double_double_int.cu new file mode 100644 index 0000000000..0c11f0621e --- /dev/null +++ b/cpp/src/distance/specializations/detail/correlation_double_double_double_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + double, + double, + decltype(raft::identity_op())>( + ops::correlation_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/correlation_float_float_float_int.cu b/cpp/src/distance/specializations/detail/correlation_float_float_float_int.cu new file mode 100644 index 0000000000..396e158554 --- /dev/null +++ b/cpp/src/distance/specializations/detail/correlation_float_float_float_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + float, + float, + decltype(raft::identity_op())>( + ops::correlation_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/cosine_double_double_double_int.cu b/cpp/src/distance/specializations/detail/cosine_double_double_double_int.cu new file mode 100644 index 0000000000..e9afb6f563 --- /dev/null +++ b/cpp/src/distance/specializations/detail/cosine_double_double_double_int.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + double, + double, + decltype(raft::identity_op())>( + ops::cosine_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/cosine_float_float_float_int.cu b/cpp/src/distance/specializations/detail/cosine_float_float_float_int.cu new file mode 100644 index 0000000000..1033c491d6 --- /dev/null +++ b/cpp/src/distance/specializations/detail/cosine_float_float_float_int.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + float, + float, + decltype(raft::identity_op())>( + ops::cosine_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu b/cpp/src/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu new file mode 100644 index 0000000000..195115914d --- /dev/null +++ b/cpp/src/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + double, + double, + decltype(raft::identity_op())>( + ops::hamming_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu b/cpp/src/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu new file mode 100644 index 0000000000..a74c6c404e --- /dev/null +++ b/cpp/src/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + float, + float, + decltype(raft::identity_op())>( + ops::hamming_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu b/cpp/src/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu new file mode 100644 index 0000000000..bac1dd7bd0 --- /dev/null +++ b/cpp/src/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + double, + double, + decltype(raft::identity_op())>( + ops::hellinger_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu b/cpp/src/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu new file mode 100644 index 0000000000..77c113b1a9 --- /dev/null +++ b/cpp/src/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + float, + float, + decltype(raft::identity_op())>( + ops::hellinger_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/distance/specializations/detail/inner_product_double_double_double_int.cu b/cpp/src/distance/specializations/detail/inner_product_double_double_double_int.cu similarity index 100% rename from cpp/src/distance/distance/specializations/detail/inner_product_double_double_double_int.cu rename to cpp/src/distance/specializations/detail/inner_product_double_double_double_int.cu diff --git a/cpp/src/distance/distance/specializations/detail/inner_product_float_float_float_int.cu b/cpp/src/distance/specializations/detail/inner_product_float_float_float_int.cu similarity index 100% rename from cpp/src/distance/distance/specializations/detail/inner_product_float_float_float_int.cu rename to cpp/src/distance/specializations/detail/inner_product_float_float_float_int.cu diff --git a/cpp/src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu b/cpp/src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu new file mode 100644 index 0000000000..188e52c152 --- /dev/null +++ b/cpp/src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void + pairwise_matrix_instantiation_point, + int, + double, + double, + decltype(raft::identity_op())>( + ops::jensen_shannon_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu b/cpp/src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu new file mode 100644 index 0000000000..b0afbf7bb2 --- /dev/null +++ b/cpp/src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void + pairwise_matrix_instantiation_point, + int, + float, + float, + decltype(raft::identity_op())>( + ops::jensen_shannon_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/distance/specializations/detail/kernels/gram_matrix_base_double.cu b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu similarity index 100% rename from cpp/src/distance/distance/specializations/detail/kernels/gram_matrix_base_double.cu rename to cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu diff --git a/cpp/src/distance/distance/specializations/detail/kernels/gram_matrix_base_float.cu b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu similarity index 100% rename from cpp/src/distance/distance/specializations/detail/kernels/gram_matrix_base_float.cu rename to cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu diff --git a/cpp/src/distance/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu similarity index 100% rename from cpp/src/distance/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu rename to cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu diff --git a/cpp/src/distance/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu similarity index 100% rename from cpp/src/distance/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu rename to cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu diff --git 
a/cpp/src/distance/distance/specializations/detail/kernels/rbf_kernel_double.cu b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu similarity index 100% rename from cpp/src/distance/distance/specializations/detail/kernels/rbf_kernel_double.cu rename to cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu diff --git a/cpp/src/distance/distance/specializations/detail/kernels/rbf_kernel_float.cu b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu similarity index 100% rename from cpp/src/distance/distance/specializations/detail/kernels/rbf_kernel_float.cu rename to cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu diff --git a/cpp/src/distance/distance/specializations/detail/kernels/tanh_kernel_double.cu b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu similarity index 100% rename from cpp/src/distance/distance/specializations/detail/kernels/tanh_kernel_double.cu rename to cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu diff --git a/cpp/src/distance/distance/specializations/detail/kernels/tanh_kernel_float.cu b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu similarity index 100% rename from cpp/src/distance/distance/specializations/detail/kernels/tanh_kernel_float.cu rename to cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu diff --git a/cpp/src/distance/specializations/detail/kl_divergence_double_double_double_int.cu b/cpp/src/distance/specializations/detail/kl_divergence_double_double_double_int.cu new file mode 100644 index 0000000000..f06ae85414 --- /dev/null +++ b/cpp/src/distance/specializations/detail/kl_divergence_double_double_double_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + double, + double, + decltype(raft::identity_op())>( + ops::kl_divergence_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/kl_divergence_float_float_float_int.cu b/cpp/src/distance/specializations/detail/kl_divergence_float_float_float_int.cu new file mode 100644 index 0000000000..00d5a5ee5b --- /dev/null +++ b/cpp/src/distance/specializations/detail/kl_divergence_float_float_float_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + float, + float, + decltype(raft::identity_op())>( + ops::kl_divergence_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/l1_double_double_double_int.cu b/cpp/src/distance/specializations/detail/l1_double_double_double_int.cu new file mode 100644 index 0000000000..5c235316da --- /dev/null +++ b/cpp/src/distance/specializations/detail/l1_double_double_double_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + double, + double, + decltype(raft::identity_op())>( + ops::l1_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/l1_float_float_float_int.cu b/cpp/src/distance/specializations/detail/l1_float_float_float_int.cu new file mode 100644 index 0000000000..fb293ca83d --- /dev/null +++ b/cpp/src/distance/specializations/detail/l1_float_float_float_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + float, + float, + decltype(raft::identity_op())>( + ops::l1_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/l2_expanded_double_double_double_int.cu b/cpp/src/distance/specializations/detail/l2_expanded_double_double_double_int.cu new file mode 100644 index 0000000000..2c02f0224f --- /dev/null +++ b/cpp/src/distance/specializations/detail/l2_expanded_double_double_double_int.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + double, + double, + decltype(raft::identity_op())>( + ops::l2_exp_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/l2_expanded_float_float_float_int.cu b/cpp/src/distance/specializations/detail/l2_expanded_float_float_float_int.cu new file mode 100644 index 0000000000..85e25a25ca --- /dev/null +++ b/cpp/src/distance/specializations/detail/l2_expanded_float_float_float_int.cu @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + float, + float, + decltype(raft::identity_op())>( + ops::l2_exp_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu b/cpp/src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu new file mode 100644 index 0000000000..5b4d995d14 --- /dev/null +++ b/cpp/src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + double, + double, + decltype(raft::identity_op())>( + ops::l2_unexp_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu b/cpp/src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu new file mode 100644 index 0000000000..a63c3f0bb8 --- /dev/null +++ b/cpp/src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + float, + float, + decltype(raft::identity_op())>( + ops::l2_unexp_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/l_inf_double_double_double_int.cu b/cpp/src/distance/specializations/detail/l_inf_double_double_double_int.cu new file mode 100644 index 0000000000..831167523f --- /dev/null +++ b/cpp/src/distance/specializations/detail/l_inf_double_double_double_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + double, + double, + decltype(raft::identity_op())>( + ops::l_inf_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/l_inf_float_float_float_int.cu b/cpp/src/distance/specializations/detail/l_inf_float_float_float_int.cu new file mode 100644 index 0000000000..02e667cbe3 --- /dev/null +++ b/cpp/src/distance/specializations/detail/l_inf_float_float_float_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + float, + float, + decltype(raft::identity_op())>( + ops::l_inf_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu b/cpp/src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu new file mode 100644 index 0000000000..ebd71065ec --- /dev/null +++ b/cpp/src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + double, + double, + decltype(raft::identity_op())>( + ops::lp_unexp_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu b/cpp/src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu new file mode 100644 index 0000000000..b94a81fdce --- /dev/null +++ b/cpp/src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + float, + float, + decltype(raft::identity_op())>( + ops::lp_unexp_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/russel_rao_double_double_double_int.cu b/cpp/src/distance/specializations/detail/russel_rao_double_double_double_int.cu new file mode 100644 index 0000000000..6f952fcc37 --- /dev/null +++ b/cpp/src/distance/specializations/detail/russel_rao_double_double_double_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + double, + double, + decltype(raft::identity_op())>( + ops::russel_rao_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/specializations/detail/russel_rao_float_float_float_int.cu b/cpp/src/distance/specializations/detail/russel_rao_float_float_float_int.cu new file mode 100644 index 0000000000..3223ce33a7 --- /dev/null +++ b/cpp/src/distance/specializations/detail/russel_rao_float_float_float_int.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include // raft::identity_op +#include // ops::* +#include // pairwise_matrix_instantiation_point +#include + +namespace raft::distance::detail { + +template void pairwise_matrix_instantiation_point, + int, + float, + float, + decltype(raft::identity_op())>( + ops::russel_rao_distance_op, + pairwise_matrix_params, + cudaStream_t); + +} // namespace raft::distance::detail diff --git a/cpp/src/distance/distance/specializations/fused_l2_nn_double_int.cu b/cpp/src/distance/specializations/fused_l2_nn_double_int.cu similarity index 100% rename from cpp/src/distance/distance/specializations/fused_l2_nn_double_int.cu rename to cpp/src/distance/specializations/fused_l2_nn_double_int.cu diff --git a/cpp/src/distance/distance/specializations/fused_l2_nn_double_int64.cu b/cpp/src/distance/specializations/fused_l2_nn_double_int64.cu similarity index 100% rename from cpp/src/distance/distance/specializations/fused_l2_nn_double_int64.cu rename to cpp/src/distance/specializations/fused_l2_nn_double_int64.cu diff --git a/cpp/src/distance/distance/specializations/fused_l2_nn_float_int.cu b/cpp/src/distance/specializations/fused_l2_nn_float_int.cu similarity index 100% rename from cpp/src/distance/distance/specializations/fused_l2_nn_float_int.cu rename to cpp/src/distance/specializations/fused_l2_nn_float_int.cu diff --git a/cpp/src/distance/distance/specializations/fused_l2_nn_float_int64.cu b/cpp/src/distance/specializations/fused_l2_nn_float_int64.cu similarity index 100% rename from cpp/src/distance/distance/specializations/fused_l2_nn_float_int64.cu rename to cpp/src/distance/specializations/fused_l2_nn_float_int64.cu diff --git a/cpp/src/distance/matrix/specializations/detail/select_k_float_int64_t.cu b/cpp/src/matrix/specializations/detail/select_k_float_int64_t.cu similarity index 100% rename from cpp/src/distance/matrix/specializations/detail/select_k_float_int64_t.cu rename to cpp/src/matrix/specializations/detail/select_k_float_int64_t.cu diff --git 
a/cpp/src/distance/matrix/specializations/detail/select_k_float_uint32_t.cu b/cpp/src/matrix/specializations/detail/select_k_float_uint32_t.cu similarity index 100% rename from cpp/src/distance/matrix/specializations/detail/select_k_float_uint32_t.cu rename to cpp/src/matrix/specializations/detail/select_k_float_uint32_t.cu diff --git a/cpp/src/distance/matrix/specializations/detail/select_k_half_int64_t.cu b/cpp/src/matrix/specializations/detail/select_k_half_int64_t.cu similarity index 100% rename from cpp/src/distance/matrix/specializations/detail/select_k_half_int64_t.cu rename to cpp/src/matrix/specializations/detail/select_k_half_int64_t.cu diff --git a/cpp/src/distance/matrix/specializations/detail/select_k_half_uint32_t.cu b/cpp/src/matrix/specializations/detail/select_k_half_uint32_t.cu similarity index 100% rename from cpp/src/distance/matrix/specializations/detail/select_k_half_uint32_t.cu rename to cpp/src/matrix/specializations/detail/select_k_half_uint32_t.cu diff --git a/cpp/src/neighbors/brute_force_knn_int64_t_float.cu b/cpp/src/neighbors/brute_force_knn_int64_t_float.cu new file mode 100644 index 0000000000..88545b3607 --- /dev/null +++ b/cpp/src/neighbors/brute_force_knn_int64_t_float.cu @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include + +#include + +#include + +namespace raft::runtime::neighbors::brute_force { + +#define RAFT_INST_BFKNN(IDX_T, DATA_T, MATRIX_IDX_T, INDEX_LAYOUT, SEARCH_LAYOUT) \ + void knn(raft::device_resources const& handle, \ + raft::device_matrix_view index, \ + raft::device_matrix_view search, \ + raft::device_matrix_view indices, \ + raft::device_matrix_view distances, \ + distance::DistanceType metric, \ + std::optional metric_arg, \ + std::optional global_id_offset) \ + { \ + std::vector> vec; \ + vec.push_back(index); \ + raft::neighbors::brute_force::knn( \ + handle, vec, search, indices, distances, metric, metric_arg, global_id_offset); \ + } + +RAFT_INST_BFKNN(int64_t, float, int64_t, raft::row_major, raft::row_major); + +#undef RAFT_INST_BFKNN + +} // namespace raft::runtime::neighbors::brute_force diff --git a/cpp/src/distance/neighbors/ivf_flat_build.cu b/cpp/src/neighbors/ivf_flat_build.cu similarity index 100% rename from cpp/src/distance/neighbors/ivf_flat_build.cu rename to cpp/src/neighbors/ivf_flat_build.cu diff --git a/cpp/src/distance/neighbors/ivf_flat_search.cu b/cpp/src/neighbors/ivf_flat_search.cu similarity index 100% rename from cpp/src/distance/neighbors/ivf_flat_search.cu rename to cpp/src/neighbors/ivf_flat_search.cu diff --git a/cpp/src/distance/neighbors/ivfpq_build.cu b/cpp/src/neighbors/ivfpq_build.cu similarity index 100% rename from cpp/src/distance/neighbors/ivfpq_build.cu rename to cpp/src/neighbors/ivfpq_build.cu diff --git a/cpp/src/distance/neighbors/ivfpq_deserialize.cu b/cpp/src/neighbors/ivfpq_deserialize.cu similarity index 100% rename from cpp/src/distance/neighbors/ivfpq_deserialize.cu rename to cpp/src/neighbors/ivfpq_deserialize.cu diff --git a/cpp/src/distance/neighbors/ivfpq_search_float_int64_t.cu b/cpp/src/neighbors/ivfpq_search_float_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/ivfpq_search_float_int64_t.cu rename to 
cpp/src/neighbors/ivfpq_search_float_int64_t.cu diff --git a/cpp/src/distance/neighbors/ivfpq_search_int8_t_int64_t.cu b/cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/ivfpq_search_int8_t_int64_t.cu rename to cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu diff --git a/cpp/src/distance/neighbors/ivfpq_search_uint8_t_int64_t.cu b/cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/ivfpq_search_uint8_t_int64_t.cu rename to cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu diff --git a/cpp/src/distance/neighbors/ivfpq_serialize.cu b/cpp/src/neighbors/ivfpq_serialize.cu similarity index 100% rename from cpp/src/distance/neighbors/ivfpq_serialize.cu rename to cpp/src/neighbors/ivfpq_serialize.cu diff --git a/cpp/src/distance/neighbors/refine_d_int64_t_float.cu b/cpp/src/neighbors/refine_d_int64_t_float.cu similarity index 100% rename from cpp/src/distance/neighbors/refine_d_int64_t_float.cu rename to cpp/src/neighbors/refine_d_int64_t_float.cu diff --git a/cpp/src/distance/neighbors/refine_d_int64_t_int8_t.cu b/cpp/src/neighbors/refine_d_int64_t_int8_t.cu similarity index 100% rename from cpp/src/distance/neighbors/refine_d_int64_t_int8_t.cu rename to cpp/src/neighbors/refine_d_int64_t_int8_t.cu diff --git a/cpp/src/distance/neighbors/refine_d_int64_t_uint8_t.cu b/cpp/src/neighbors/refine_d_int64_t_uint8_t.cu similarity index 100% rename from cpp/src/distance/neighbors/refine_d_int64_t_uint8_t.cu rename to cpp/src/neighbors/refine_d_int64_t_uint8_t.cu diff --git a/cpp/src/distance/neighbors/refine_h_int64_t_float.cu b/cpp/src/neighbors/refine_h_int64_t_float.cu similarity index 100% rename from cpp/src/distance/neighbors/refine_h_int64_t_float.cu rename to cpp/src/neighbors/refine_h_int64_t_float.cu diff --git a/cpp/src/distance/neighbors/refine_h_int64_t_int8_t.cu b/cpp/src/neighbors/refine_h_int64_t_int8_t.cu similarity index 100% rename from 
cpp/src/distance/neighbors/refine_h_int64_t_int8_t.cu rename to cpp/src/neighbors/refine_h_int64_t_int8_t.cu diff --git a/cpp/src/distance/neighbors/refine_h_int64_t_uint8_t.cu b/cpp/src/neighbors/refine_h_int64_t_uint8_t.cu similarity index 100% rename from cpp/src/distance/neighbors/refine_h_int64_t_uint8_t.cu rename to cpp/src/neighbors/refine_h_int64_t_uint8_t.cu diff --git a/cpp/src/nn/specializations/ball_cover_all_knn_query.cu b/cpp/src/neighbors/specializations/ball_cover_all_knn_query.cu similarity index 80% rename from cpp/src/nn/specializations/ball_cover_all_knn_query.cu rename to cpp/src/neighbors/specializations/ball_cover_all_knn_query.cu index d9cb836bfc..305dd6796e 100644 --- a/cpp/src/nn/specializations/ball_cover_all_knn_query.cu +++ b/cpp/src/neighbors/specializations/ball_cover_all_knn_query.cu @@ -16,14 +16,7 @@ #include #include - -// Ignore upstream specializations to avoid unnecessary recompiling -#ifdef RAFT_DISTANCE_COMPILED -#include -#endif - -// TODO: Change this to proper specializations after FAISS is removed -#include +#include #include diff --git a/cpp/src/nn/specializations/ball_cover_build_index.cu b/cpp/src/neighbors/specializations/ball_cover_build_index.cu similarity index 81% rename from cpp/src/nn/specializations/ball_cover_build_index.cu rename to cpp/src/neighbors/specializations/ball_cover_build_index.cu index 76c5a2bd5b..ec7f4bcf52 100644 --- a/cpp/src/nn/specializations/ball_cover_build_index.cu +++ b/cpp/src/neighbors/specializations/ball_cover_build_index.cu @@ -16,14 +16,7 @@ #include #include - -// Ignore upstream specializations to avoid unnecessary recompiling -#ifdef RAFT_DISTANCE_COMPILED -#include -#endif - -// TODO: Change this to proper specializations after FAISS is removed -#include +#include #include diff --git a/cpp/src/nn/specializations/ball_cover_knn_query.cu b/cpp/src/neighbors/specializations/ball_cover_knn_query.cu similarity index 80% rename from cpp/src/nn/specializations/ball_cover_knn_query.cu 
rename to cpp/src/neighbors/specializations/ball_cover_knn_query.cu index c01da452b2..634427200e 100644 --- a/cpp/src/nn/specializations/ball_cover_knn_query.cu +++ b/cpp/src/neighbors/specializations/ball_cover_knn_query.cu @@ -14,18 +14,10 @@ * limitations under the License. */ +#include #include #include - -// Ignore upstream specializations to avoid unnecessary recompiling -#ifdef RAFT_DISTANCE_COMPILED -#include -#endif - -// TODO: Change this to proper specializations after FAISS is removed -#include - -#include +#include namespace raft::neighbors::ball_cover { template void knn_query( diff --git a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu b/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_2d.cu similarity index 91% rename from cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu rename to cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_2d.cu index 9a71ce4f9a..b69751a62a 100644 --- a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_2d.cu +++ b/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_2d.cu @@ -15,11 +15,9 @@ */ #include +#include #include -// TODO: Change this to proper specializations after FAISS is removed -#include - namespace raft { namespace spatial { namespace knn { diff --git a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu b/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_3d.cu similarity index 91% rename from cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu rename to cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_3d.cu index b1b3439e8f..ca44ad3165 100644 --- a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_one_3d.cu +++ b/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_3d.cu @@ -15,11 +15,9 @@ */ #include +#include #include -// TODO: Change this to proper specializations after FAISS is removed -#include - namespace raft { 
namespace spatial { namespace knn { diff --git a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu b/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_2d.cu similarity index 91% rename from cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu rename to cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_2d.cu index 9f512dcda1..ba44327653 100644 --- a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_2d.cu +++ b/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_2d.cu @@ -15,11 +15,8 @@ */ #include +#include #include - -// TODO: Change this to proper specializations after FAISS is removed -#include - namespace raft { namespace spatial { namespace knn { diff --git a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu b/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_3d.cu similarity index 91% rename from cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu rename to cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_3d.cu index 0eeb448d1e..59132c1f99 100644 --- a/cpp/src/nn/specializations/detail/ball_cover_lowdim_pass_two_3d.cu +++ b/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_3d.cu @@ -15,11 +15,9 @@ */ #include +#include #include -// TODO: Change this to proper specializations after FAISS is removed -#include - namespace raft { namespace spatial { namespace knn { diff --git a/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_long_float_int.cu b/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_long_float_int.cu new file mode 100644 index 0000000000..04aa42c9f1 --- /dev/null +++ b/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_long_float_int.cu @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace raft::neighbors::detail { +#define RAFT_INST(IdxT, T, IntT) \ + template void brute_force_knn_impl(raft::device_resources const& handle, \ + std::vector& input, \ + std::vector& sizes, \ + IntT D, \ + T* search_items, \ + IntT n, \ + IdxT* res_I, \ + T* res_D, \ + IntT k, \ + bool rowMajorIndex, \ + bool rowMajorQuery, \ + std::vector* translations, \ + raft::distance::DistanceType metric, \ + float metricArg, \ + raft::identity_op); +RAFT_INST(long, float, int); +#undef RAFT_INST +} // namespace raft::neighbors::detail diff --git a/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_long_float_uint.cu b/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_long_float_uint.cu new file mode 100644 index 0000000000..a8b9d4299a --- /dev/null +++ b/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_long_float_uint.cu @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace raft::neighbors::detail { +#define RAFT_INST(IdxT, T, IntT) \ + template void brute_force_knn_impl(raft::device_resources const& handle, \ + std::vector& input, \ + std::vector& sizes, \ + IntT D, \ + T* search_items, \ + IntT n, \ + IdxT* res_I, \ + T* res_D, \ + IntT k, \ + bool rowMajorIndex, \ + bool rowMajorQuery, \ + std::vector* translations, \ + raft::distance::DistanceType metric, \ + float metricArg, \ + raft::identity_op); +RAFT_INST(long, float, unsigned int); +#undef RAFT_INST +} // namespace raft::neighbors::detail diff --git a/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_int.cu b/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_int.cu new file mode 100644 index 0000000000..c97e6e936a --- /dev/null +++ b/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_int.cu @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace raft::neighbors::detail { +#define RAFT_INST(IdxT, T, IntT) \ + template void brute_force_knn_impl(raft::device_resources const& handle, \ + std::vector& input, \ + std::vector& sizes, \ + IntT D, \ + T* search_items, \ + IntT n, \ + IdxT* res_I, \ + T* res_D, \ + IntT k, \ + bool rowMajorIndex, \ + bool rowMajorQuery, \ + std::vector* translations, \ + raft::distance::DistanceType metric, \ + float metricArg, \ + raft::identity_op); +RAFT_INST(uint32_t, float, int); +#undef RAFT_INST +} // namespace raft::neighbors::detail diff --git a/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_uint.cu b/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_uint.cu new file mode 100644 index 0000000000..87451c385a --- /dev/null +++ b/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_uint.cu @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace raft::neighbors::detail { +#define RAFT_INST(IdxT, T, IntT) \ + template void brute_force_knn_impl(raft::device_resources const& handle, \ + std::vector& input, \ + std::vector& sizes, \ + IntT D, \ + T* search_items, \ + IntT n, \ + IdxT* res_I, \ + T* res_D, \ + IntT k, \ + bool rowMajorIndex, \ + bool rowMajorQuery, \ + std::vector* translations, \ + raft::distance::DistanceType metric, \ + float metricArg, \ + raft::identity_op); +RAFT_INST(uint32_t, float, unsigned int); +#undef RAFT_INST +} // namespace raft::neighbors::detail diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_float_fast.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_float_fast.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_float_fast.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_float_float_fast.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu 
b/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu diff --git 
a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_half_fast.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_half_fast.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_half_fast.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_float_half_fast.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu rename to 
cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu similarity index 100% rename from 
cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_half_fast.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_half_fast.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_half_fast.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_half_half_fast.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu rename to cpp/src/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu diff --git a/cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_float_half_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/ivfpq_compute_similarity_float_half_no_smem_lut.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/detail/ivfpq_compute_similarity_float_half_no_smem_lut.cu rename to cpp/src/neighbors/specializations/detail/ivfpq_compute_similarity_float_half_no_smem_lut.cu diff --git a/cpp/src/nn/specializations/fused_l2_knn_int_float_false.cu b/cpp/src/neighbors/specializations/fused_l2_knn_int_float_false.cu 
similarity index 93% rename from cpp/src/nn/specializations/fused_l2_knn_int_float_false.cu rename to cpp/src/neighbors/specializations/fused_l2_knn_int_float_false.cu index 41cf409416..72fdac9526 100644 --- a/cpp/src/nn/specializations/fused_l2_knn_int_float_false.cu +++ b/cpp/src/neighbors/specializations/fused_l2_knn_int_float_false.cu @@ -15,11 +15,9 @@ */ #include +#include #include -// TODO: Change this to proper specializations after FAISS is removed -#include - namespace raft { namespace spatial { namespace knn { diff --git a/cpp/src/nn/specializations/fused_l2_knn_int_float_true.cu b/cpp/src/neighbors/specializations/fused_l2_knn_int_float_true.cu similarity index 93% rename from cpp/src/nn/specializations/fused_l2_knn_int_float_true.cu rename to cpp/src/neighbors/specializations/fused_l2_knn_int_float_true.cu index 7d183d7220..c7616462fe 100644 --- a/cpp/src/nn/specializations/fused_l2_knn_int_float_true.cu +++ b/cpp/src/neighbors/specializations/fused_l2_knn_int_float_true.cu @@ -15,11 +15,9 @@ */ #include +#include #include -// TODO: Change this to proper specializations after FAISS is removed -#include - namespace raft { namespace spatial { namespace knn { diff --git a/cpp/src/nn/specializations/fused_l2_knn_long_float_false.cu b/cpp/src/neighbors/specializations/fused_l2_knn_long_float_false.cu similarity index 93% rename from cpp/src/nn/specializations/fused_l2_knn_long_float_false.cu rename to cpp/src/neighbors/specializations/fused_l2_knn_long_float_false.cu index fa273986dc..16bf058238 100644 --- a/cpp/src/nn/specializations/fused_l2_knn_long_float_false.cu +++ b/cpp/src/neighbors/specializations/fused_l2_knn_long_float_false.cu @@ -15,11 +15,9 @@ */ #include +#include #include -// TODO: Change this to proper specializations after FAISS is removed -#include - namespace raft { namespace spatial { namespace knn { diff --git a/cpp/src/nn/specializations/fused_l2_knn_long_float_true.cu 
b/cpp/src/neighbors/specializations/fused_l2_knn_long_float_true.cu similarity index 93% rename from cpp/src/nn/specializations/fused_l2_knn_long_float_true.cu rename to cpp/src/neighbors/specializations/fused_l2_knn_long_float_true.cu index 5313a87786..06cf55eae3 100644 --- a/cpp/src/nn/specializations/fused_l2_knn_long_float_true.cu +++ b/cpp/src/neighbors/specializations/fused_l2_knn_long_float_true.cu @@ -15,11 +15,9 @@ */ #include +#include #include -// TODO: Change this to proper specializations after FAISS is removed -#include - namespace raft { namespace spatial { namespace knn { diff --git a/cpp/src/distance/neighbors/specializations/ivfflat_build_float_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_build_float_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/ivfflat_build_float_int64_t.cu rename to cpp/src/neighbors/specializations/ivfflat_build_float_int64_t.cu diff --git a/cpp/src/distance/neighbors/specializations/ivfflat_build_int8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_build_int8_t_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/ivfflat_build_int8_t_int64_t.cu rename to cpp/src/neighbors/specializations/ivfflat_build_int8_t_int64_t.cu diff --git a/cpp/src/distance/neighbors/specializations/ivfflat_build_uint8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_build_uint8_t_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/ivfflat_build_uint8_t_int64_t.cu rename to cpp/src/neighbors/specializations/ivfflat_build_uint8_t_int64_t.cu diff --git a/cpp/src/distance/neighbors/specializations/ivfflat_extend_float_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_extend_float_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/ivfflat_extend_float_int64_t.cu rename to cpp/src/neighbors/specializations/ivfflat_extend_float_int64_t.cu diff --git 
a/cpp/src/distance/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu rename to cpp/src/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu diff --git a/cpp/src/distance/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu rename to cpp/src/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu diff --git a/cpp/src/neighbors/specializations/ivfflat_search_float_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_search_float_int64_t.cu new file mode 100644 index 0000000000..dce7083139 --- /dev/null +++ b/cpp/src/neighbors/specializations/ivfflat_search_float_int64_t.cu @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace raft::neighbors::ivf_flat { + +// greppable-id-specializations-ivf-flat-search: The ivfflat_interleaved_scan +// function is used in both raft::neighbors::ivf_flat::search and +// raft::neighbors::detail::refine_device. 
To prevent a duplicate instantiation +// of this function (which defines ~270 kernels) in the refine specializations, +// an extern template definition is provided. To make sure +// ivfflat_interleaved_scan is actually compiled here, we explicitly instantiate +// it below. Please check related function calls after editing template +// definition below. Search for `greppable-id-specializations-ivf-flat-search` +// to find them. +#define RAFT_MAKE_INSTANCE(T, IdxT) \ + template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan< \ + T, \ + typename raft::spatial::knn::detail::utils::config::value_t, \ + IdxT>(const index& index, \ + const T* queries, \ + const uint32_t* coarse_query_results, \ + const uint32_t n_queries, \ + const raft::distance::DistanceType metric, \ + const uint32_t n_probes, \ + const uint32_t k, \ + const bool select_min, \ + IdxT* neighbors, \ + float* distances, \ + uint32_t& grid_dim_x, \ + rmm::cuda_stream_view stream); \ + \ + template void search(raft::device_resources const&, \ + raft::neighbors::ivf_flat::search_params const&, \ + const raft::neighbors::ivf_flat::index&, \ + raft::device_matrix_view, \ + raft::device_matrix_view, \ + raft::device_matrix_view); + +RAFT_MAKE_INSTANCE(float, int64_t); + +#undef RAFT_MAKE_INSTANCE + +} // namespace raft::neighbors::ivf_flat diff --git a/cpp/src/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu new file mode 100644 index 0000000000..b03d878bae --- /dev/null +++ b/cpp/src/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace raft::neighbors::ivf_flat { + +#define RAFT_MAKE_INSTANCE(T, IdxT) \ + template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan< \ + T, \ + typename raft::spatial::knn::detail::utils::config::value_t, \ + IdxT>(const index& index, \ + const T* queries, \ + const uint32_t* coarse_query_results, \ + const uint32_t n_queries, \ + const raft::distance::DistanceType metric, \ + const uint32_t n_probes, \ + const uint32_t k, \ + const bool select_min, \ + IdxT* neighbors, \ + float* distances, \ + uint32_t& grid_dim_x, \ + rmm::cuda_stream_view stream); \ + \ + template void search(raft::device_resources const&, \ + raft::neighbors::ivf_flat::search_params const&, \ + const raft::neighbors::ivf_flat::index&, \ + raft::device_matrix_view, \ + raft::device_matrix_view, \ + raft::device_matrix_view); + +RAFT_MAKE_INSTANCE(int8_t, int64_t); + +#undef RAFT_MAKE_INSTANCE + +} // namespace raft::neighbors::ivf_flat diff --git a/cpp/src/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu new file mode 100644 index 0000000000..2d42bae0d1 --- /dev/null +++ b/cpp/src/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace raft::neighbors::ivf_flat { + +#define RAFT_MAKE_INSTANCE(T, IdxT) \ + template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan< \ + T, \ + typename raft::spatial::knn::detail::utils::config::value_t, \ + IdxT>(const index& index, \ + const T* queries, \ + const uint32_t* coarse_query_results, \ + const uint32_t n_queries, \ + const raft::distance::DistanceType metric, \ + const uint32_t n_probes, \ + const uint32_t k, \ + const bool select_min, \ + IdxT* neighbors, \ + float* distances, \ + uint32_t& grid_dim_x, \ + rmm::cuda_stream_view stream); \ + \ + template void search(raft::device_resources const&, \ + raft::neighbors::ivf_flat::search_params const&, \ + const raft::neighbors::ivf_flat::index&, \ + raft::device_matrix_view, \ + raft::device_matrix_view, \ + raft::device_matrix_view); + +RAFT_MAKE_INSTANCE(uint8_t, int64_t); + +#undef RAFT_MAKE_INSTANCE + +} // namespace raft::neighbors::ivf_flat diff --git a/cpp/src/distance/neighbors/specializations/ivfpq_build_float_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_build_float_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/ivfpq_build_float_int64_t.cu rename to cpp/src/neighbors/specializations/ivfpq_build_float_int64_t.cu diff --git a/cpp/src/distance/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu rename to 
cpp/src/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu diff --git a/cpp/src/distance/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu rename to cpp/src/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu diff --git a/cpp/src/distance/neighbors/specializations/ivfpq_extend_float_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_extend_float_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/ivfpq_extend_float_int64_t.cu rename to cpp/src/neighbors/specializations/ivfpq_extend_float_int64_t.cu diff --git a/cpp/src/distance/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu rename to cpp/src/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu diff --git a/cpp/src/distance/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu rename to cpp/src/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu diff --git a/cpp/src/distance/neighbors/specializations/ivfpq_search_float_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_search_float_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/ivfpq_search_float_int64_t.cu rename to cpp/src/neighbors/specializations/ivfpq_search_float_int64_t.cu diff --git a/cpp/src/distance/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu similarity index 100% rename from 
cpp/src/distance/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu rename to cpp/src/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu diff --git a/cpp/src/distance/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu rename to cpp/src/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu diff --git a/cpp/src/distance/neighbors/specializations/refine_d_int64_t_float.cu b/cpp/src/neighbors/specializations/refine_d_int64_t_float.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/refine_d_int64_t_float.cu rename to cpp/src/neighbors/specializations/refine_d_int64_t_float.cu diff --git a/cpp/src/distance/neighbors/specializations/refine_d_int64_t_int8_t.cu b/cpp/src/neighbors/specializations/refine_d_int64_t_int8_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/refine_d_int64_t_int8_t.cu rename to cpp/src/neighbors/specializations/refine_d_int64_t_int8_t.cu diff --git a/cpp/src/distance/neighbors/specializations/refine_d_int64_t_uint8_t.cu b/cpp/src/neighbors/specializations/refine_d_int64_t_uint8_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/refine_d_int64_t_uint8_t.cu rename to cpp/src/neighbors/specializations/refine_d_int64_t_uint8_t.cu diff --git a/cpp/src/distance/neighbors/specializations/refine_h_int64_t_float.cu b/cpp/src/neighbors/specializations/refine_h_int64_t_float.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/refine_h_int64_t_float.cu rename to cpp/src/neighbors/specializations/refine_h_int64_t_float.cu diff --git a/cpp/src/distance/neighbors/specializations/refine_h_int64_t_int8_t.cu b/cpp/src/neighbors/specializations/refine_h_int64_t_int8_t.cu similarity index 100% rename from 
cpp/src/distance/neighbors/specializations/refine_h_int64_t_int8_t.cu rename to cpp/src/neighbors/specializations/refine_h_int64_t_int8_t.cu diff --git a/cpp/src/distance/neighbors/specializations/refine_h_int64_t_uint8_t.cu b/cpp/src/neighbors/specializations/refine_h_int64_t_uint8_t.cu similarity index 100% rename from cpp/src/distance/neighbors/specializations/refine_h_int64_t_uint8_t.cu rename to cpp/src/neighbors/specializations/refine_h_int64_t_uint8_t.cu diff --git a/cpp/src/distance/random/common.cuh b/cpp/src/random/common.cuh similarity index 100% rename from cpp/src/distance/random/common.cuh rename to cpp/src/random/common.cuh diff --git a/cpp/src/distance/random/rmat_rectangular_generator_int64_double.cu b/cpp/src/random/rmat_rectangular_generator_int64_double.cu similarity index 93% rename from cpp/src/distance/random/rmat_rectangular_generator_int64_double.cu rename to cpp/src/random/rmat_rectangular_generator_int64_double.cu index 1b8fb8bd6d..657aa0533c 100644 --- a/cpp/src/distance/random/rmat_rectangular_generator_int64_double.cu +++ b/cpp/src/random/rmat_rectangular_generator_int64_double.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/distance/random/rmat_rectangular_generator_int64_float.cu b/cpp/src/random/rmat_rectangular_generator_int64_float.cu similarity index 93% rename from cpp/src/distance/random/rmat_rectangular_generator_int64_float.cu rename to cpp/src/random/rmat_rectangular_generator_int64_float.cu index 249e8c2ffb..9cd748da89 100644 --- a/cpp/src/distance/random/rmat_rectangular_generator_int64_float.cu +++ b/cpp/src/random/rmat_rectangular_generator_int64_float.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/distance/random/rmat_rectangular_generator_int_double.cu b/cpp/src/random/rmat_rectangular_generator_int_double.cu similarity index 93% rename from cpp/src/distance/random/rmat_rectangular_generator_int_double.cu rename to cpp/src/random/rmat_rectangular_generator_int_double.cu index 3333b87983..1f10dbc03c 100644 --- a/cpp/src/distance/random/rmat_rectangular_generator_int_double.cu +++ b/cpp/src/random/rmat_rectangular_generator_int_double.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/distance/random/rmat_rectangular_generator_int_float.cu b/cpp/src/random/rmat_rectangular_generator_int_float.cu similarity index 93% rename from cpp/src/distance/random/rmat_rectangular_generator_int_float.cu rename to cpp/src/random/rmat_rectangular_generator_int_float.cu index db8d024c04..fecc134326 100644 --- a/cpp/src/distance/random/rmat_rectangular_generator_int_float.cu +++ b/cpp/src/random/rmat_rectangular_generator_int_float.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/template/CMakeLists.txt b/cpp/template/CMakeLists.txt new file mode 100644 index 0000000000..501a5c9503 --- /dev/null +++ b/cpp/template/CMakeLists.txt @@ -0,0 +1,38 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) + +# ------------- configure rapids-cmake --------------# + +include(cmake/thirdparty/fetch_rapids.cmake) +include(rapids-cmake) +include(rapids-cpm) +include(rapids-cuda) +include(rapids-export) +include(rapids-find) + +# ------------- configure project --------------# + +rapids_cuda_init_architectures(test_raft) + +project(test_raft LANGUAGES CXX CUDA) + +# ------------- configure raft -----------------# + +rapids_cpm_init() +include(cmake/thirdparty/get_raft.cmake) + +# -------------- compile tasks ----------------- # +add_executable(TEST_RAFT src/test_distance.cu) +target_link_libraries(TEST_RAFT PRIVATE raft::raft raft::compiled) diff --git a/cpp/template/README.md b/cpp/template/README.md new file mode 100644 index 0000000000..348dff270a --- /dev/null +++ b/cpp/template/README.md @@ -0,0 +1,18 @@ +# Example RAFT Project Template + +This template project provides a drop-in sample to either start building a new application with, or using RAFT in an existing CMake project. + +First, please refer to our [installation docs](https://docs.rapids.ai/api/raft/stable/build.html#cuda-gpu-requirements) for the minimum requirements to use RAFT. + +Once the minimum requirements are satisfied, this example template application can be built with the provided `build.sh` script. 
This is a bash script that calls the appropriate CMake commands, so you can look into it to see the typical CMake based build workflow. + +This directory (`RAFT_SOURCE/cpp/template`) can be copied directly in order to build a new application with RAFT. + +RAFT can be integrated into an existing CMake project by copying the contents in the `configure rapids-cmake` and `configure raft` sections of the provided `CMakeLists.txt` into your project, along with `cmake/thirdparty/get_raft.cmake`. + +Make sure to link against the appropriate Cmake targets. Use `raft::raft`to add make the headers available and `raft::compiled` when utilizing the shared library. + +```cmake +target_link_libraries(your_app_target PRIVATE raft::raft raft::compiled) +``` + diff --git a/cpp/template/build.sh b/cpp/template/build.sh new file mode 100755 index 0000000000..3ac00fc9af --- /dev/null +++ b/cpp/template/build.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Copyright (c) 2023, NVIDIA CORPORATION. + +# raft empty project template build script + +# Abort script on first error +set -e + +PARALLEL_LEVEL=${PARALLEL_LEVEL:=`nproc`} + +BUILD_TYPE=Release +BUILD_DIR=build/ + +RAFT_REPO_REL="" +EXTRA_CMAKE_ARGS="" +set -e + + +if [[ ${RAFT_REPO_REL} != "" ]]; then + RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`" + EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCPM_raft_SOURCE=${RAFT_REPO_PATH}" +fi + +if [ "$1" == "clean" ]; then + rm -rf build + exit 0 +fi + +mkdir -p $BUILD_DIR +cd $BUILD_DIR + +cmake \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DRAFT_NVTX=OFF \ + -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ + ${EXTRA_CMAKE_ARGS} \ + ../ + +cmake --build . 
-j${PARALLEL_LEVEL} diff --git a/cpp/template/cmake/thirdparty/fetch_rapids.cmake b/cpp/template/cmake/thirdparty/fetch_rapids.cmake new file mode 100644 index 0000000000..248f4f1af4 --- /dev/null +++ b/cpp/template/cmake/thirdparty/fetch_rapids.cmake @@ -0,0 +1,21 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +# Use this variable to update RAPIDS and RAFT versions +set(RAPIDS_VERSION "23.06") + +if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake + ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) +endif() +include(${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) diff --git a/cpp/template/cmake/thirdparty/get_raft.cmake b/cpp/template/cmake/thirdparty/get_raft.cmake new file mode 100644 index 0000000000..5463942adf --- /dev/null +++ b/cpp/template/cmake/thirdparty/get_raft.cmake @@ -0,0 +1,62 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +# Use RAPIDS_VERSION from cmake/thirdparty/fetch_rapids.cmake +set(RAFT_VERSION "${RAPIDS_VERSION}") +set(RAFT_FORK "rapidsai") +set(RAFT_PINNED_TAG "branch-${RAPIDS_VERSION}") + +function(find_and_configure_raft) + set(oneValueArgs VERSION FORK PINNED_TAG COMPILE_LIBRARY ENABLE_NVTX ENABLE_MNMG_DEPENDENCIES) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + set(RAFT_COMPONENTS "") + if(PKG_COMPILE_LIBRARY) + string(APPEND RAFT_COMPONENTS " compiled") + endif() + + if(PKG_ENABLE_MNMG_DEPENDENCIES) + string(APPEND RAFT_COMPONENTS " distributed") + endif() + + #----------------------------------------------------- + # Invoke CPM find_package() + #----------------------------------------------------- + rapids_cpm_find(raft ${PKG_VERSION} + GLOBAL_TARGETS raft::raft + BUILD_EXPORT_SET raft-template-exports + INSTALL_EXPORT_SET raft-template-exports + COMPONENTS ${RAFT_COMPONENTS} + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/raft.git + GIT_TAG ${PKG_PINNED_TAG} + SOURCE_SUBDIR cpp + OPTIONS + "BUILD_TESTS OFF" + "BUILD_BENCH OFF" + "RAFT_NVTX ${ENABLE_NVTX}" + "RAFT_COMPILE_LIBRARY ${PKG_COMPILE_LIBRARY}" + ) +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_raft_SOURCE=/path/to/local/raft +find_and_configure_raft(VERSION ${RAFT_VERSION}.00 + FORK ${RAFT_FORK} + PINNED_TAG ${RAFT_PINNED_TAG} + COMPILE_LIBRARY ON + ENABLE_MNMG_DEPENDENCIES OFF + ENABLE_NVTX OFF +) diff --git a/cpp/template/src/test_distance.cu 
b/cpp/template/src/test_distance.cu new file mode 100644 index 0000000000..b86dde70e5 --- /dev/null +++ b/cpp/template/src/test_distance.cu @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#ifdef RAFT_COMPILED +#include +#endif + +int main() +{ + raft::device_resources handle; + + int n_samples = 5000; + int n_features = 50; + + auto input = raft::make_device_matrix(handle, n_samples, n_features); + auto labels = raft::make_device_vector(handle, n_samples); + auto output = raft::make_device_matrix(handle, n_samples, n_samples); + + raft::random::make_blobs(handle, input.view(), labels.view()); + + auto metric = raft::distance::DistanceType::L2SqrtExpanded; + raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric); +} diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index acfb470bd8..a778b0d195 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -17,7 +17,7 @@ function(ConfigureTest) - set(options OPTIONAL DIST NN) + set(options OPTIONAL LIB) set(oneValueArgs NAME) set(multiValueArgs PATH TARGETS CONFIGURATIONS) @@ -33,8 +33,7 @@ function(ConfigureTest) ${TEST_NAME} PRIVATE raft::raft raft_internal - $<$:raft::distance> - $<$:raft::nn> + $<$:raft::compiled> GTest::gtest GTest::gtest_main Threads::Threads @@ -87,8 +86,7 @@ if(BUILD_TESTS) test/cluster/linkage.cu 
test/cluster/kmeans_find_k.cu OPTIONAL - DIST - NN + LIB ) ConfigureTest( @@ -140,7 +138,7 @@ if(BUILD_TESTS) test/distance/fused_l2_nn.cu test/distance/gram.cu OPTIONAL - DIST + LIB ) ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu) @@ -201,7 +199,7 @@ if(BUILD_TESTS) test/matrix/triangular.cu test/sparse/spectral_matrix.cu OPTIONAL - DIST + LIB ) ConfigureTest( @@ -221,7 +219,7 @@ if(BUILD_TESTS) ConfigureTest( NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu - test/lap/lap.cu test/sparse/mst.cu OPTIONAL DIST + test/lap/lap.cu test/sparse/mst.cu OPTIONAL LIB ) ConfigureTest( @@ -245,13 +243,12 @@ if(BUILD_TESTS) ) ConfigureTest( - NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu OPTIONAL DIST - NN + NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu OPTIONAL LIB ) ConfigureTest( NAME SPARSE_NEIGHBORS_TEST PATH test/sparse/neighbors/connect_components.cu - test/sparse/neighbors/brute_force.cu test/sparse/neighbors/knn_graph.cu OPTIONAL DIST NN + test/sparse/neighbors/brute_force.cu test/sparse/neighbors/knn_graph.cu OPTIONAL LIB ) ConfigureTest( @@ -271,13 +268,11 @@ if(BUILD_TESTS) test/neighbors/tiled_knn.cu test/neighbors/haversine.cu test/neighbors/ball_cover.cu - test/neighbors/faiss_mr.cu test/neighbors/epsilon_neighborhood.cu test/neighbors/refine.cu test/neighbors/selection.cu OPTIONAL - DIST - NN + LIB ) ConfigureTest( @@ -310,8 +305,7 @@ if(BUILD_TESTS) test/stats/weighted_mean.cu test/stats/v_measure.cu OPTIONAL - DIST - NN + LIB ) ConfigureTest( diff --git a/cpp/test/cluster/cluster_solvers.cu b/cpp/test/cluster/cluster_solvers.cu index 5121cdf139..f26c598a2b 100644 --- a/cpp/test/cluster/cluster_solvers.cu +++ b/cpp/test/cluster/cluster_solvers.cu @@ -19,7 +19,7 @@ #include #include -#if defined RAFT_DISTANCE_COMPILED && defined RAFT_NN_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git 
a/cpp/test/cluster/kmeans.cu b/cpp/test/cluster/kmeans.cu index 3e2153dcde..cfec84256b 100644 --- a/cpp/test/cluster/kmeans.cu +++ b/cpp/test/cluster/kmeans.cu @@ -29,7 +29,7 @@ #include #include -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/test/cluster/kmeans_balanced.cu b/cpp/test/cluster/kmeans_balanced.cu index ae06572061..220eba4186 100644 --- a/cpp/test/cluster/kmeans_balanced.cu +++ b/cpp/test/cluster/kmeans_balanced.cu @@ -30,7 +30,7 @@ #include #include -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/test/cluster/kmeans_find_k.cu b/cpp/test/cluster/kmeans_find_k.cu index e80cbaa93b..a865651f56 100644 --- a/cpp/test/cluster/kmeans_find_k.cu +++ b/cpp/test/cluster/kmeans_find_k.cu @@ -25,7 +25,7 @@ #include #include -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/test/cluster/linkage.cu b/cpp/test/cluster/linkage.cu index 20f2952e7d..4946d52f26 100644 --- a/cpp/test/cluster/linkage.cu +++ b/cpp/test/cluster/linkage.cu @@ -20,8 +20,8 @@ #include #include -#if defined RAFT_NN_COMPILED -#include +#if defined RAFT_COMPILED +#include #endif #include diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index 5fcaf07539..438e212fbd 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -16,16 +16,24 @@ #include "../test_utils.cuh" #include -#include -#include -#include +#include // common::nvtx::range + +#include // make_device_matrix_view +#include // raft::device_resources +#include // raft::sqrt +#include // raft::distance::DistanceType +#include +#include // rmm::device_uvector + +// When the distance library is precompiled, include only the raft_runtime +// headers. 
This way, a small change in one of the kernel internals does not +// trigger a rebuild of the test files (it of course still triggers a rebuild of +// the raft specializations) +#if defined RAFT_COMPILED +#include +#else #include -#include -#include -#if defined RAFT_DISTANCE_COMPILED -#include #endif -#include namespace raft { namespace distance { @@ -409,6 +417,25 @@ template return os; } +// TODO: Remove when mdspan-based raft::runtime::distance::pairwise_distance is +// implemented. +// +// Context: +// https://github.com/rapidsai/raft/issues/1338 +template +constexpr bool layout_to_row_major(); + +template <> +constexpr bool layout_to_row_major() +{ + return true; +} +template <> +constexpr bool layout_to_row_major() +{ + return false; +} + template void distanceLauncher(raft::device_resources const& handle, DataType* x, @@ -422,12 +449,23 @@ void distanceLauncher(raft::device_resources const& handle, DataType threshold, DataType metric_arg = 2.0f) { +#if defined RAFT_COMPILED + // TODO: Implement and use mdspan-based + // raft::runtime::distance::pairwise_distance here. 
+ // + // Context: + // https://github.com/rapidsai/raft/issues/1338 + bool row_major = layout_to_row_major(); + raft::runtime::distance::pairwise_distance( + handle, x, y, dist, m, n, k, distanceType, row_major, metric_arg); +#else auto x_v = make_device_matrix_view(x, m, k); auto y_v = make_device_matrix_view(y, n, k); auto dist_v = make_device_matrix_view(dist, m, n); raft::distance::distance( handle, x_v, y_v, dist_v, metric_arg); +#endif } template @@ -523,9 +561,25 @@ class BigMatrixDistanceTest : public ::testing::Test { auto testInfo = testing::UnitTest::GetInstance()->current_test_info(); common::nvtx::range fun_scope("test::%s/%s", testInfo->test_suite_name(), testInfo->name()); + void pairwise_distance(raft::device_resources const& handle, + float* x, + float* y, + float* dists, + int m, + int n, + int k, + raft::distance::DistanceType metric, + bool isRowMajor, + float metric_arg); + constexpr bool row_major = true; + constexpr float metric_arg = 0.0f; +#if defined RAFT_COMPILED + raft::runtime::distance::pairwise_distance( + handle, x.data(), x.data(), dist.data(), m, n, k, distanceType, row_major, metric_arg); +#else raft::distance::distance( - handle, x.data(), x.data(), dist.data(), m, n, k, true, 0.0f); - + handle, x.data(), x.data(), dist.data(), m, n, k, row_major, metric_arg); +#endif RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); } diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index af67214193..383ad39319 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -24,7 +24,7 @@ #include #include -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif @@ -182,22 +182,20 @@ class FusedL2NNTest : public ::testing::TestWithParam> { int m = params.m; int n = params.n; int k = params.k; - MinAndDistanceReduceOp redOp; - fusedL2NN, int>( - out, - x.data(), - y.data(), - xn.data(), - yn.data(), - m, - n, - k, - (void*)workspace.data(), - redOp, - 
raft::distance::KVPMinReduce(), - Sqrt, - true, - stream); + + const bool init_out_buffer = true; + fusedL2NNMinReduce, int>(out, + x.data(), + y.data(), + xn.data(), + yn.data(), + m, + n, + k, + (void*)workspace.data(), + Sqrt, + init_out_buffer, + stream); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } }; diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu index a2f0e2385c..f99d02dc7f 100644 --- a/cpp/test/distance/gram.cu +++ b/cpp/test/distance/gram.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/test/distance/masked_nn.cu b/cpp/test/distance/masked_nn.cu index 6f7d8bf44a..d01911206b 100644 --- a/cpp/test/distance/masked_nn.cu +++ b/cpp/test/distance/masked_nn.cu @@ -28,7 +28,7 @@ #include #include -#ifdef RAFT_DISTANCE_COMPILED +#ifdef RAFT_COMPILED #include #endif diff --git a/cpp/test/matrix/select_k.cu b/cpp/test/matrix/select_k.cu index a9fc4c8f40..2a40d70abc 100644 --- a/cpp/test/matrix/select_k.cu +++ b/cpp/test/matrix/select_k.cu @@ -18,7 +18,7 @@ #include -#ifdef RAFT_DISTANCE_COMPILED +#ifdef RAFT_COMPILED #include #endif @@ -332,6 +332,7 @@ INSTANTIATE_TEST_CASE_P( // NOLINT testing::Values(select::Algo::kPublicApi, select::Algo::kRadix8bits, select::Algo::kRadix11bits, + select::Algo::kRadix11bitsExtraPass, select::Algo::kWarpImmediate, select::Algo::kWarpFiltered, select::Algo::kWarpDistributed))); @@ -426,6 +427,7 @@ INSTANTIATE_TEST_CASE_P( // NOLINT testing::Combine(inputs_random_longlist, testing::Values(select::Algo::kRadix8bits, select::Algo::kRadix11bits, + select::Algo::kRadix11bitsExtraPass, select::Algo::kWarpImmediate, select::Algo::kWarpFiltered, select::Algo::kWarpDistributed, @@ -440,6 +442,7 @@ INSTANTIATE_TEST_CASE_P( // NOLINT testing::Combine(inputs_random_longlist, testing::Values(select::Algo::kRadix8bits, select::Algo::kRadix11bits, + select::Algo::kRadix11bitsExtraPass, select::Algo::kWarpImmediate, 
select::Algo::kWarpFiltered, select::Algo::kWarpDistributed, @@ -451,7 +454,11 @@ TEST_P(ReferencedRandomDoubleInt, LargeSize) { run(); } // NOLINT INSTANTIATE_TEST_CASE_P( // NOLINT SelectK, ReferencedRandomDoubleInt, - testing::Combine(inputs_random_largesize, testing::Values(select::Algo::kWarpAuto))); + testing::Combine(inputs_random_largesize, + testing::Values(select::Algo::kWarpAuto, + select::Algo::kRadix8bits, + select::Algo::kRadix11bits, + select::Algo::kRadix11bitsExtraPass))); using ReferencedRandomFloatSizeT = SelectK::params_random>; @@ -459,6 +466,7 @@ TEST_P(ReferencedRandomFloatSizeT, LargeK) { run(); } // NOLINT INSTANTIATE_TEST_CASE_P(SelectK, // NOLINT ReferencedRandomFloatSizeT, testing::Combine(inputs_random_largek, - testing::Values(select::Algo::kRadix11bits))); + testing::Values(select::Algo::kRadix11bits, + select::Algo::kRadix11bitsExtraPass))); } // namespace raft::matrix diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index 486ff61724..fe6f9163a0 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -36,7 +36,7 @@ #include -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu index cee0d03c99..e430af89df 100644 --- a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu +++ b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu @@ -18,7 +18,7 @@ #include "../ann_ivf_flat.cuh" -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu index 95876f9165..e4e7a207fb 100644 --- a/cpp/test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu +++ b/cpp/test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu @@ -18,7 +18,7 @@ #include "../ann_ivf_flat.cuh" -#if defined 
RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu index ebee20c2b6..ef7980401a 100644 --- a/cpp/test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu +++ b/cpp/test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu @@ -18,7 +18,7 @@ #include "../ann_ivf_flat.cuh" -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index c368192b03..c331081314 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -24,7 +24,7 @@ #include #include #include -#ifdef RAFT_DISTANCE_COMPILED +#ifdef RAFT_COMPILED #include #else #pragma message("NN specializations are not enabled; expect very long building times.") diff --git a/cpp/test/neighbors/ball_cover.cu b/cpp/test/neighbors/ball_cover.cu index 6dcae8e34d..46ef3a9150 100644 --- a/cpp/test/neighbors/ball_cover.cu +++ b/cpp/test/neighbors/ball_cover.cu @@ -22,8 +22,9 @@ #include #include #include -#if defined RAFT_NN_COMPILED -#include + +#ifdef RAFT_COMPILED +#include #endif #include @@ -120,7 +121,6 @@ void compute_bfknn(const raft::device_resources& handle, make_device_matrix_view(X2, n_query_rows, d), make_device_matrix_view(inds, n_query_rows, k), make_device_matrix_view(dists, n_query_rows, k), - k, metric); } diff --git a/cpp/test/neighbors/epsilon_neighborhood.cu b/cpp/test/neighbors/epsilon_neighborhood.cu index 977e8f3ce8..769cb7ec2d 100644 --- a/cpp/test/neighbors/epsilon_neighborhood.cu +++ b/cpp/test/neighbors/epsilon_neighborhood.cu @@ -23,7 +23,7 @@ #include #include -#ifdef RAFT_DISTANCE_COMPILED +#ifdef RAFT_COMPILED #include #endif diff --git a/cpp/test/neighbors/faiss_mr.cu b/cpp/test/neighbors/faiss_mr.cu deleted file mode 100644 index 89f012db0f..0000000000 --- a/cpp/test/neighbors/faiss_mr.cu +++ /dev/null @@ -1,95 +0,0 @@ -/* - * 
Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "../test_utils.cuh" - -#include -#include -#include -#include - -#include - -#include - -#include -#include -#include - -namespace raft { -namespace spatial { -namespace knn { - -using namespace faiss::gpu; - -struct AllocInputs { - size_t size; -}; - -template -class FAISS_MR_Test : public ::testing::TestWithParam { - public: - FAISS_MR_Test() - : params_(::testing::TestWithParam::GetParam()), stream(handle.get_stream()) - { - } - - protected: - size_t getFreeMemory(MemorySpace mem_space) - { - if (mem_space == MemorySpace::Device) { - rmm::mr::cuda_memory_resource cmr; - rmm::mr::device_memory_resource* dmr = &cmr; - return dmr->get_mem_info(stream).first; - } else if (mem_space == MemorySpace::Unified) { - rmm::mr::managed_memory_resource mmr; - rmm::mr::device_memory_resource* dmr = &mmr; - return dmr->get_mem_info(stream).first; - } - return 0; - } - - void testAllocs(MemorySpace mem_space) - { - raft::spatial::knn::RmmGpuResources faiss_mr; - auto faiss_mr_impl = faiss_mr.getResources(); - size_t free_before = getFreeMemory(mem_space); - AllocRequest req(AllocType::Other, 0, mem_space, stream, params_.size); - void* ptr = faiss_mr_impl->allocMemory(req); - size_t free_after_alloc = getFreeMemory(mem_space); - faiss_mr_impl->deallocMemory(0, ptr); - ASSERT_TRUE(free_after_alloc <= free_before - params_.size); - } - - raft::device_resources 
handle; - cudaStream_t stream; - AllocInputs params_; -}; - -const std::vector inputs = {{19687}}; - -typedef FAISS_MR_Test FAISS_MR_TestF; -TEST_P(FAISS_MR_TestF, TestAllocs) -{ - testAllocs(MemorySpace::Device); - testAllocs(MemorySpace::Unified); -} - -INSTANTIATE_TEST_CASE_P(FAISS_MR_Test, FAISS_MR_TestF, ::testing::ValuesIn(inputs)); - -} // namespace knn -} // namespace spatial -} // namespace raft diff --git a/cpp/test/neighbors/fused_l2_knn.cu b/cpp/test/neighbors/fused_l2_knn.cu index a5fead8093..ab05b41cc9 100644 --- a/cpp/test/neighbors/fused_l2_knn.cu +++ b/cpp/test/neighbors/fused_l2_knn.cu @@ -23,12 +23,8 @@ #include #include -#if defined RAFT_NN_COMPILED -#include -#endif - -#ifdef RAFT_DISTANCE_COMPILED -#include +#ifdef RAFT_COMPILED +#include #endif #include diff --git a/cpp/test/neighbors/knn.cu b/cpp/test/neighbors/knn.cu index 7976725c65..bcd4b9cb0b 100644 --- a/cpp/test/neighbors/knn.cu +++ b/cpp/test/neighbors/knn.cu @@ -21,14 +21,10 @@ #include #include -#if defined RAFT_DISTANCE_COMPILED +#ifdef RAFT_COMPILED #include #endif -#if defined RAFT_NN_COMPILED -#include -#endif - #include #include @@ -100,7 +96,7 @@ class KNNTest : public ::testing::TestWithParam { raft::make_device_matrix_view(distances_.data(), rows_, k_); auto metric = raft::distance::DistanceType::L2Unexpanded; - knn(handle, index, search, indices, distances, k_, metric, std::make_optional(0)); + knn(handle, index, search, indices, distances, metric, std::make_optional(0)); build_actual_output<<>>( actual_labels_.data(), rows_, k_, search_labels_.data(), indices_.data()); diff --git a/cpp/test/neighbors/refine.cu b/cpp/test/neighbors/refine.cu index 8866c404a9..dd3491673e 100644 --- a/cpp/test/neighbors/refine.cu +++ b/cpp/test/neighbors/refine.cu @@ -31,7 +31,7 @@ #include -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/test/neighbors/selection.cu b/cpp/test/neighbors/selection.cu index 25939f65c3..9f13de357c 100644 --- 
a/cpp/test/neighbors/selection.cu +++ b/cpp/test/neighbors/selection.cu @@ -24,7 +24,7 @@ #include #include -#if defined RAFT_DISTANCE_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/test/neighbors/tiled_knn.cu b/cpp/test/neighbors/tiled_knn.cu index 4784f915f3..ccc3a64edd 100644 --- a/cpp/test/neighbors/tiled_knn.cu +++ b/cpp/test/neighbors/tiled_knn.cu @@ -25,8 +25,7 @@ #include #include -#if defined RAFT_NN_COMPILED -#include +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/test/sparse/neighbors/knn_graph.cu b/cpp/test/sparse/neighbors/knn_graph.cu index 3b025fc082..8873445c37 100644 --- a/cpp/test/sparse/neighbors/knn_graph.cu +++ b/cpp/test/sparse/neighbors/knn_graph.cu @@ -22,8 +22,8 @@ #include #include -#if defined RAFT_NN_COMPILED -#include +#if defined RAFT_COMPILED +#include #endif #include diff --git a/cpp/test/stats/silhouette_score.cu b/cpp/test/stats/silhouette_score.cu index 80e60a4884..40b7e59d81 100644 --- a/cpp/test/stats/silhouette_score.cu +++ b/cpp/test/stats/silhouette_score.cu @@ -20,7 +20,7 @@ #include #include -#if defined RAFT_DISTANCE_COMPILED && defined RAFT_NN_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/cpp/test/stats/trustworthiness.cu b/cpp/test/stats/trustworthiness.cu index a2f72516eb..2fde6b29c1 100644 --- a/cpp/test/stats/trustworthiness.cu +++ b/cpp/test/stats/trustworthiness.cu @@ -20,7 +20,7 @@ #include #include -#if defined RAFT_DISTANCE_COMPILED && defined RAFT_NN_COMPILED +#if defined RAFT_COMPILED #include #endif diff --git a/dependencies.yaml b/dependencies.yaml index e920141a79..0460e2dd81 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -7,11 +7,24 @@ files: arch: [x86_64] includes: - build + - build_pylibraft - cudatoolkit - develop - docs - - run - - test_python + - run_raft_dask + - run_pylibraft + - test_python_common + - test_pylibraft + bench_ann: + output: conda + matrix: + cuda: ["11.8"] + arch: [x86_64] + includes: + - build + - develop + - 
cudatoolkit + - nn_bench test_cpp: output: none includes: @@ -21,7 +34,8 @@ files: includes: - cudatoolkit - py_version - - test_python + - test_python_common + - test_pylibraft checks: output: none includes: @@ -33,6 +47,54 @@ files: - cudatoolkit - docs - py_version + py_build_pylibraft: + output: pyproject + pyproject_dir: python/pylibraft + extras: + table: build-system + includes: + - build + - build_pylibraft + - build_wheels + py_run_pylibraft: + output: pyproject + pyproject_dir: python/pylibraft + extras: + table: project + includes: + - run_pylibraft + py_test_pylibraft: + output: pyproject + pyproject_dir: python/pylibraft + extras: + table: project.optional-dependencies + key: test + includes: + - test_python_common + - test_pylibraft + py_build_raft_dask: + output: pyproject + pyproject_dir: python/raft-dask + extras: + table: build-system + includes: + - build + - build_wheels + py_run_raft_dask: + output: pyproject + pyproject_dir: python/raft-dask + extras: + table: project + includes: + - run_raft_dask + py_test_raft_dask: + output: pyproject + pyproject_dir: python/raft-dask + extras: + table: project.optional-dependencies + key: test + includes: + - test_python_common channels: - rapidsai - rapidsai-nightly @@ -42,10 +104,9 @@ channels: dependencies: build: common: - - output_types: [conda, requirements] + - output_types: [conda, requirements, pyproject] packages: - cmake>=3.23.1,!=3.25.0 - - cuda-python >=11.7.1,<12.0 - cython>=0.29,<0.30 - ninja - scikit-build>=0.13.1 @@ -53,6 +114,7 @@ dependencies: packages: - c-compiler - cxx-compiler + - nccl>=2.9.9 specific: - output_types: conda matrices: @@ -66,6 +128,12 @@ dependencies: packages: - gcc_linux-aarch64=11.* - sysroot_linux-aarch64==2.17 + build_pylibraft: + common: + - output_types: [conda, requirements, pyproject] + packages: + - &cuda_python cuda-python >=11.7.1,<12.0 + - &rmm rmm==23.6.* checks: common: - output_types: [conda, requirements] @@ -79,6 +147,17 @@ dependencies: - 
output_types: [conda] packages: - clang-tools=11.1.0 + nn_bench: + common: + - output_types: [conda] + packages: + - hnswlib=0.7.0 + - nlohmann_json>=3.11.2 + - glog>=0.6.0 + - h5py>=3.8.0 + - libfaiss>=1.7.1 + - faiss-proc=*=cuda + cudatoolkit: specific: - output_types: conda @@ -150,6 +229,12 @@ dependencies: - recommonmark - sphinx-copybutton - sphinx-markdown-tables + build_wheels: + common: + - output_types: pyproject + packages: + - wheel + - setuptools py_version: specific: - output_types: conda @@ -169,25 +254,41 @@ dependencies: - matrix: packages: - python>=3.8,<3.11 - run: + run_pylibraft: common: - - output_types: [conda] + - output_types: [conda, pyproject] + packages: + - &numpy numpy>=1.21 + - *cuda_python + - *rmm + run_raft_dask: + common: + - output_types: [conda, pyproject] packages: - dask>=2023.1.1 + - dask-cuda==23.6.* - distributed>=2023.1.1 + - joblib>=0.11 + - numba>=0.49 + - *numpy + - ucx-py==0.32.* + - output_types: conda + packages: - ucx>=1.13.0 - - ucx-py=0.32.* - ucx-proc=*=gpu - - rmm=23.06 - - libfaiss>=1.7.1=cuda* - - faiss-proc=*=cuda - - dask-cuda=23.06 - test_python: + - output_types: pyproject + packages: + - pylibraft==23.6.* + test_python_common: common: - - output_types: [conda, requirements] + - output_types: [conda, requirements, pyproject] packages: - - cupy - pytest - pytest-cov + test_pylibraft: + common: + - output_types: [conda, requirements, pyproject] + packages: + - cupy - scikit-learn - scipy diff --git a/docs/source/build.md b/docs/source/build.md index 29d0a72a37..6c9978a11f 100644 --- a/docs/source/build.md +++ b/docs/source/build.md @@ -4,8 +4,7 @@ The easiest way to install RAFT is through conda and several packages are provided. - `libraft-headers` RAFT headers -- `libraft-nn` (optional) contains shared libraries for the nearest neighbors primitives. -- `libraft-distance` (optional) contains shared libraries for distance primitives. 
+- `libraft` (optional) shared library containing pre-compiled template specializations and runtime API. - `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives. - `raft-dask` (optional) enables deployment of multi-node multi-GPU algorithms that use RAFT `raft::comms` in Dask clusters. @@ -14,7 +13,7 @@ Use the following command to install all of the RAFT packages with conda (replac mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft ``` -You can also install the `libraft-*` conda packages individually using the `mamba` command above. +You can also install the conda packages individually using the `mamba` command above. After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ cmake build to compile and/or link against needed dependencies in your raft target. `COMPONENTS` are optional and will depend on the packages installed. @@ -22,8 +21,8 @@ After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used i pylibraft and raft-dask both have experimental packages that can be [installed through pip](https://rapids.ai/pip.html#install): ```bash -pip install pylibraft-cu11 --extra-index-url=https://pypi.ngc.nvidia.com -pip install raft-dask-cu11 --extra-index-url=https://pypi.ngc.nvidia.com +pip install pylibraft-cu11 --extra-index-url=https://pypi.nvidia.com +pip install raft-dask-cu11 --extra-index-url=https://pypi.nvidia.com ``` ## Building and installing RAFT @@ -42,12 +41,10 @@ In addition to the libraries included with cudatoolkit 11.0+, there are some oth #### Required - [RMM](https://github.com/rapidsai/rmm) corresponding to RAFT version. - [Thrust](https://github.com/NVIDIA/thrust) v1.17 / [CUB](https://github.com/NVIDIA/cub) - -#### Optional - [cuCollections](https://github.com/NVIDIA/cuCollections) - Used in `raft::sparse::distance` API. 
-- [Libcu++](https://github.com/NVIDIA/libcudacxx) v1.7.0 - Used by cuCollections - [CUTLASS](https://github.com/NVIDIA/cutlass) v2.9.1 - Used in `raft::distance` API. -- [FAISS](https://github.com/facebookresearch/faiss) v1.7.0 - Used in `raft::neighbors` API. + +#### Optional - [NCCL](https://github.com/NVIDIA/nccl) - Used in `raft::comms` API and needed to build `raft-dask`. - [UCX](https://github.com/openucx/ucx) - Used in `raft::comms` API and needed to build `raft-dask`. - [Googletest](https://github.com/google/googletest) - Needed to build tests @@ -60,14 +57,14 @@ The recommended way to build and install RAFT is to use the `build.sh` script in ### Header-only C++ -`build.sh` uses [rapids-cmake](https://github.com/rapidsai/rapids-cmake), which will automatically download any dependencies which are not already installed. It's important to note that while all the headers will be installed and available, some parts of the RAFT API depend on libraries like `FAISS`, which will need to be explicitly enabled in `build.sh`. +`build.sh` uses [rapids-cmake](https://github.com/rapidsai/rapids-cmake), which will automatically download any dependencies which are not already installed. It's important to note that while all the headers will be installed and available, some parts of the RAFT API depend on libraries like CUTLASS, which will need to be explicitly enabled in `build.sh`. The following example will download the needed dependencies and install the RAFT headers into `$INSTALL_PREFIX/include/raft`. ```bash ./build.sh libraft ``` -The `-n` flag can be passed to just have the build download the needed dependencies. Since RAFT is primarily used at build-time, the dependencies will never be installed by the RAFT build, with the exception of building FAISS statically into the shared libraries. +The `-n` flag can be passed to just have the build download the needed dependencies. 
Since RAFT is primarily used at build-time, the dependencies will never be installed by the RAFT build. ```bash ./build.sh libraft -n ``` @@ -78,21 +75,16 @@ Once installed, `libraft` headers (and dependencies which were downloaded and in ``` -### C++ Shared Libraries (optional) - -For larger projects which make heavy use of the pairwise distances or nearest neighbors APIs, shared libraries can be built to speed up compile times. These shared libraries can also significantly improve re-compile times both while developing RAFT and developing against the APIs. Build all of the available shared libraries by passing `--compile-libs` flag to `build.sh`: -```bash -./build.sh libraft --compile-libs -``` +### C++ Shared Library (optional) -Individual shared libraries have their own flags and multiple can be used (though currently only the `nn` and `distance` packages contain shared libraries): +A shared library can be built for speeding up compile times. The shared library also contains a runtime API that allows you to invoke RAFT APIs directly from C++ source files (without `nvcc`). The shared library can also significantly improve re-compile times both while developing RAFT and using its APIs to develop applications. Pass the `--compile-lib` flag to `build.sh` to build the library: ```bash -./build.sh libraft --compile-nn --compile-dist +./build.sh libraft --compile-lib ``` -In above example the shared libraries are installed by default into `$INSTALL_PREFIX/lib`. To disable this, pass `-n` flag. +In above example the shared library is installed by default into `$INSTALL_PREFIX/lib`. To disable this, pass `-n` flag. 
-Once installed, the shared libraries, headers (and any dependencies downloaded and installed via `rapids-cmake`) can be uninstalled using `build.sh`: +Once installed, the shared library, headers (and any dependencies downloaded and installed via `rapids-cmake`) can be uninstalled using `build.sh`: ```bash ./build.sh libraft --uninstall ``` @@ -117,7 +109,7 @@ Compile the tests using the `tests` target in `build.sh`. Test compile times can be improved significantly by using the optional shared libraries. If installed, they will be used automatically when building the tests but `--compile-libs` can be used to add additional compilation units and compile them with the tests. ```bash -./build.sh libraft tests --compile-libs +./build.sh libraft tests --compile-lib ``` The tests are broken apart by algorithm category, so you will find several binaries in `cpp/build/` named `*_TEST`. @@ -153,29 +145,25 @@ Use `CMAKE_INSTALL_PREFIX` to install RAFT into a specific location. The snippet cd cpp mkdir build cd build -cmake -D BUILD_TESTS=ON -DRAFT_COMPILE_LIBRARIES=ON -DRAFT_ENABLE_NN_DEPENDENCIES=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX ../ +cmake -D BUILD_TESTS=ON -DRAFT_COMPILE_LIBRARY=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX ../ make -j install ``` RAFT's cmake has the following configurable flags available:. 
-| Flag | Possible Values | Default Value | Behavior | -| --- | --- | --- | --- | -| BUILD_TESTS | ON, OFF | ON | Compile Googletests | -| BUILD_BENCH | ON, OFF | OFF | Compile benchmarks | -| raft_FIND_COMPONENTS | nn distance | | Configures the optional components as a space-separated list | -| RAFT_COMPILE_LIBRARIES | ON, OFF | ON if either BUILD_TESTS or BUILD_BENCH is ON; otherwise OFF | Compiles all `libraft` shared libraries (these are required for Googletests) | -| RAFT_COMPILE_NN_LIBRARY | ON, OFF | OFF | Compiles the `libraft-nn` shared library | -| RAFT_COMPILE_DIST_LIBRARY | ON, OFF | OFF | Compiles the `libraft-distance` shared library | -| RAFT_ENABLE_NN_DEPENDENCIES | ON, OFF | OFF | Searches for dependencies of nearest neighbors API, such as FAISS, and compiles them if not found. Needed for `raft::spatial::knn` | -| RAFT_USE_FAISS_STATIC | ON, OFF | OFF | Statically link FAISS into `libraft-nn` | -| DETECT_CONDA_ENV | ON, OFF | ON | Enable detection of conda environment for dependencies | -| RAFT_NVTX | ON, OFF | OFF | Enable NVTX Markers | -| CUDA_ENABLE_KERNELINFO | ON, OFF | OFF | Enables `kernelinfo` in nvcc. This is useful for `compute-sanitizer` | -| CUDA_ENABLE_LINEINFO | ON, OFF | OFF | Enable the -lineinfo option for nvcc | -| CUDA_STATIC_RUNTIME | ON, OFF | OFF | Statically link the CUDA runtime | - -Currently, shared libraries are provided for the `libraft-nn` and `libraft-distance` components. The `libraft-nn` component depends upon [FAISS](https://github.com/facebookresearch/faiss) and the `RAFT_ENABLE_NN_DEPENDENCIES` option will build it from source if it is not already installed. 
+| Flag | Possible Values | Default Value | Behavior | +|---------------------------|----------------------| --- | --- | +| BUILD_TESTS | ON, OFF | ON | Compile Googletests | +| BUILD_BENCH | ON, OFF | OFF | Compile benchmarks | +| raft_FIND_COMPONENTS | compiled distributed | | Configures the optional components as a space-separated list | +| RAFT_COMPILE_LIBRARY | ON, OFF | ON if either BUILD_TESTS or BUILD_BENCH is ON; otherwise OFF | Compiles all `libraft` shared libraries (these are required for Googletests) | +| DETECT_CONDA_ENV | ON, OFF | ON | Enable detection of conda environment for dependencies | +| RAFT_NVTX | ON, OFF | OFF | Enable NVTX Markers | +| CUDA_ENABLE_KERNELINFO | ON, OFF | OFF | Enables `kernelinfo` in nvcc. This is useful for `compute-sanitizer` | +| CUDA_ENABLE_LINEINFO | ON, OFF | OFF | Enable the -lineinfo option for nvcc | +| CUDA_STATIC_RUNTIME | ON, OFF | OFF | Statically link the CUDA runtime | + +Currently, shared libraries are provided for the `libraft-nn` and `libraft-distance` components. 
### Python @@ -190,9 +178,9 @@ The Python APIs can be built and installed using the `build.sh` script: ```bash # to build pylibraft -./build.sh libraft pylibraft --compile-libs +./build.sh libraft pylibraft --compile-lib # to build raft-dask -./build.sh libraft raft-dask --compile-libs +./build.sh libraft pylibraft raft-dask --compile-lib ``` `setup.py` can also be used to build the Python APIs manually: @@ -228,7 +216,7 @@ The documentation requires that the C++ headers and python packages have been bu The following will build the docs along with the C++ and Python packages: ``` -./build.sh libraft pylibraft raft-dask docs --compile-libs +./build.sh libraft pylibraft raft-dask docs --compile-lib ``` @@ -258,9 +246,9 @@ PROPERTIES CXX_STANDARD 17 ``` -### C++ header-only integration +### C++ header-only integration (without cmake) -When the needed [build dependencies](#build-dependencies) are already satisfied, RAFT can be trivially integrated into downstream projects by cloning the repository and adding `cpp/include` from RAFT to the include path: +While not a highly suggested method for building against RAFT, when all of the needed [build dependencies](#build-dependencies) are already satisfied, RAFT can be integrated into downstream projects by cloning the repository and adding `cpp/include` from RAFT to the include path: ```cmake set(RAFT_GIT_DIR ${CMAKE_CURRENT_BINARY_DIR}/raft CACHE STRING "Path to RAFT repo") ExternalProject_Add(raft @@ -272,26 +260,32 @@ ExternalProject_Add(raft INSTALL_COMMAND "") set(RAFT_INCLUDE_DIR ${RAFT_GIT_DIR}/raft/cpp/include CACHE STRING "RAFT include variable") ``` +### C++ header-only integration (with cmake) + + +When using cmake, you can install RAFT headers into your environment with `./build.sh libraft`. -If RAFT has already been installed, such as by using the `build.sh` script, use `find_package(raft)` and the `raft::raft` target. 
+If the RAFT headers have already been installed into your environment with cmake or through conda, such as by using the `build.sh` script, use `find_package(raft)` and the `raft::raft` target. ### Using C++ pre-compiled shared libraries -Use `find_package(raft COMPONENTS nn distance)` to enable the shared libraries and transitively pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available for configuring linking paths in addition to `raft::raft`. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). +Use `find_package(raft COMPONENTS compiled distributed)` to enable the shared library and transitively pass dependencies through separate targets for each component. In this example, the `raft::compiled` and `raft::distributed` targets will be available for configuring linking paths in addition to `raft::raft`. These targets will also pass through any transitive dependencies (such as NCCL for the `distributed` component). The pre-compiled libraries contain template specializations for commonly used types, such as single- and double-precision floating-point. In order to use the symbols in the pre-compiled libraries, the compiler needs to be told not to instantiate templates that are already contained in the shared libraries. By convention, these header files are named `specializations.cuh` and located in the base directory for the packages that contain specializations. -The following example tells the compiler to ignore the pre-compiled templates for the `libraft-distance` API so any symbols already compiled into pre-compiled shared library will be used instead: +The following example tells the compiler to ignore the pre-compiled templates for the `raft::distance` API so any symbols already compiled into the `libraft` shared library will be used instead. 
RAFT's cmake creates a variable `RAFT_COMPILED` which can be used to ignore the pre-compiled template specializations only when the shared library has been enabled through cmake (such as by specifying the `compiled` component in `find_package`): ```c++ +#ifdef RAFT_COMPILED #include #include +#endif ``` ### Building RAFT C++ from source in cmake RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library so it can be more easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [CMake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). -The following example is similar to invoking `find_package(raft)` but uses `rapids_cpm_find`, which provides a richer and more flexible configuration landscape by using CPM to fetch any dependencies not already available to the build. The `raft::raft` link target will be made available and it's recommended that it be used as a `PRIVATE` link dependency in downstream projects. The `COMPILE_LIBRARIES` option enables the building the shared libraries. +The following example is similar to invoking `find_package(raft)` but uses `rapids_cpm_find`, which provides a richer and more flexible configuration landscape by using CPM to fetch any dependencies not already available to the build. The `raft::raft` link target will be made available and it's recommended that it be used as a `PRIVATE` link dependency in downstream projects. The `COMPILE_LIBRARY` option enables the building the shared libraries. 
The following `cmake` snippet enables a flexible configuration of RAFT: @@ -302,34 +296,10 @@ set(RAFT_FORK "rapidsai") set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") function(find_and_configure_raft) - set(oneValueArgs VERSION FORK PINNED_TAG USE_FAISS_STATIC - COMPILE_LIBRARIES ENABLE_NN_DEPENDENCIES CLONE_ON_PIN - USE_NN_LIBRARY USE_DISTANCE_LIBRARY - ENABLE_thrust_DEPENDENCY) + set(oneValueArgs VERSION FORK PINNED_TAG COMPILE_LIBRARY) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) - - #----------------------------------------------------- - # Clone RAFT locally if PINNED_TAG has been changed - #----------------------------------------------------- - if(PKG_CLONE_ON_PIN AND NOT PKG_PINNED_TAG STREQUAL "branch-${RAFT_VERSION}") - message("Pinned tag found: ${PKG_PINNED_TAG}. Cloning raft locally.") - set(CPM_DOWNLOAD_raft ON) - set(CMAKE_IGNORE_PATH "${CMAKE_INSTALL_PREFIX}/include/raft;${CMAKE_IGNORE_PATH}) - endif() - - #----------------------------------------------------- - # Add components - #----------------------------------------------------- - - if(PKG_USE_NN_LIBRARY) - string(APPEND RAFT_COMPONENTS " nn") - endif() - - if(PKG_USE_DISTANCE_LIBRARY) - string(APPEND RAFT_COMPONENTS " distance") - endif() - + #----------------------------------------------------- # Invoke CPM find_package() #----------------------------------------------------- @@ -342,14 +312,11 @@ function(find_and_configure_raft) GIT_REPOSITORY https://github.com/${PKG_FORK}/raft.git GIT_TAG ${PKG_PINNED_TAG} SOURCE_SUBDIR cpp - FIND_PACKAGE_ARGUMENTS "COMPONENTS ${RAFT_COMPONENTS}" + FIND_PACKAGE_ARGUMENTS "COMPONENTS compiled distributed" OPTIONS "BUILD_TESTS OFF" "BUILD_BENCH OFF" - "RAFT_ENABLE_NN_DEPENDENCIES ${PKG_ENABLE_NN_DEPENDENCIES}" - "RAFT_USE_FAISS_STATIC ${PKG_USE_FAISS_STATIC}" - "RAFT_COMPILE_LIBRARIES ${PKG_COMPILE_LIBRARIES}" - "RAFT_ENABLE_thrust_DEPENDENCY ${PKG_ENABLE_thrust_DEPENDENCY}" + "RAFT_COMPILE_LIBRARY 
${PKG_COMPILE_LIBRARY}" ) endfunction() @@ -360,22 +327,11 @@ endfunction() find_and_configure_raft(VERSION ${RAFT_VERSION}.00 FORK ${RAFT_FORK} PINNED_TAG ${RAFT_PINNED_TAG} - - # When PINNED_TAG above doesn't match cuml, - # force local raft clone in build directory - # even if it's already installed. - CLONE_ON_PIN ON - - COMPILE_LIBRARIES NO - USE_NN_LIBRARY NO - USE_DISTANCE_LIBRARY NO - ENABLE_NN_DEPENDENCIES NO # This builds FAISS if not installed - USE_FAISS_STATIC NO - ENABLE_thrust_DEPENDENCY YES + COMPILE_LIBRARY NO ) ``` -If using the nearest neighbors APIs without the shared libraries, set `ENABLE_NN_DEPENDENCIES=ON` and keep `USE_NN_LIBRARY=OFF` +You can find a fully-functioning [example template project](../../cpp/template/README.md) in the `cpp/template` directory, which provides everything you need to build a new application with RAFT or incorporate RAFT Into your existing libraries. ## Uninstall diff --git a/docs/source/cuda_ann_benchmarks.md b/docs/source/cuda_ann_benchmarks.md new file mode 100644 index 0000000000..708f5f7dba --- /dev/null +++ b/docs/source/cuda_ann_benchmarks.md @@ -0,0 +1,322 @@ +# CUDA ANN Benchmarks + +This project provides a benchmark program for various ANN search implementations. It's especially suitable for comparing GPU implementations as well as comparing GPU against CPU. + +## Benchmark + +### Dependencies + +CUDA 11 and a GPU with Pascal architecture or later are required to run the benchmarks. + +Please refer to the [installation docs](https://docs.rapids.ai/api/raft/stable/build.html#cuda-gpu-requirements) for the base requirements to build RAFT. + +In addition to the base requirements for building RAFT, additional dependencies needed to build the ANN benchmarks include: +1. FAISS GPU >= 1.7.1 +2. Google Logging (GLog) +3. H5Py +4. HNSWLib +5. nlohmann_json +6. 
GGNN + +[rapids-cmake](https://github.com/rapidsai/rapids-cmake) is used to build the ANN benchmarks so the code for dependencies not already supplied in the CUDA toolkit will be downloaded and built automatically. + +The easiest (and most reproducible) way to install the dependencies needed to build the ANN benchmarks is to use the conda environment file located in the `conda/environments` directory of the RAFT repository. The following command will use `mamba` (which is preferred over `conda`) to build and activate a new environment for compiling the benchmarks: + +```bash +mamba env create --name raft_ann_benchmarks -f conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +conda activate raft_ann_benchmarks +``` + +The above conda environment will also reduce the compile times as dependencies like FAISS will already be installed and not need to be compiled with `rapids-cmake`. + +### Compiling the Benchmarks + +After the needed dependencies are satisfied, the easiest way to compile ANN benchmarks is through the `build.sh` script in the root of the RAFT source code repository. The following will build the executables for all the support algorithms: +```bash +./build.sh bench-ann +``` + +You can limit the algorithms that are built by providing a semicolon-delimited list of executable names (each algorithm is suffixed with `_ANN_BENCH`): +```bash +./build.sh bench-ann --limit-bench-ann=HNSWLIB_ANN_BENCH;RAFT_IVF_PQ_ANN_BENCH +``` + +Available targets to use with `--limit-bench-ann` are: +- FAISS_IVF_FLAT_ANN_BENCH +- FAISS_IVF_PQ_ANN_BENCH +- FAISS_BFKNN_ANN_BENCH +- GGNN_ANN_BENCH +- HNSWLIB_ANN_BENCH +- RAFT_IVF_PQ_ANN_BENCH +- RAFT_IVF_FLAT_ANN_BENCH +- RAFT_BFKNN_ANN_BENCH + +By default, the `*_ANN_BENCH` executables program infer the dataset's datatype from the filename's extension. 
For example, an extension of `fbin` uses a `float` datatype, `f16bin` uses a `float16` datatype, extension of `i8bin` uses `int8_t` datatype, and `u8bin` uses `uint8_t` type. Currently, only `float`, `float16`, int8_t`, and `unit8_t` are supported. + +### Usage +There are 4 general steps to running the benchmarks: +1. Prepare Dataset +2. Build Index +3. Search Using Built Index +4. Evaluate Result + +#### End-to-end Example +An end-to-end example (run from the RAFT source code root directory): +```bash +# (1) prepare a dataset +pushd + +cd cpp/bench/ann +mkdir data && cd data +wget http://ann-benchmarks.com/glove-100-angular.hdf5 + +# option -n is used here to normalize vectors so cosine distance is converted +# to inner product; don't use -n for l2 distance +python scripts/hdf5_to_fbin.py -n glove-100-angular.hdf5 + +mkdir glove-100-inner +mv glove-100-angular.base.fbin glove-100-inner/base.fbin +mv glove-100-angular.query.fbin glove-100-inner/query.fbin +mv glove-100-angular.groundtruth.neighbors.ibin glove-100-inner/groundtruth.neighbors.ibin +mv glove-100-angular.groundtruth.distances.fbin glove-100-inner/groundtruth.distances.fbin +popd + +# (2) build index +./cpp/build/RAFT_IVF_FLAT_ANN_BENCH -b -i raft_ivf_flat.nlist1024 conf/glove-100-inner.json + +# (3) search +./cpp/build/RAFT_IVF_FLAT_ANN_BENCH -s -i raft_ivf_flat.nlist1024 conf/glove-100-inner.json + +# (4) evaluate result +pushd +cd cpp/bench/ann +./scripts/eval.pl \ + -o result.csv \ + data/glove-100-inner/groundtruth.neighbors.ibin \ + result/glove-100-inner/faiss_ivf_flat +popd + +# optional step: plot QPS-Recall figure using data in result.csv with your favorite tool +``` + +##### Step 1: Prepare Dataset +A dataset usually has 4 binary files containing database vectors, query vectors, ground truth neighbors and their corresponding distances. 
For example, Glove-100 dataset has files `base.fbin` (database vectors), `query.fbin` (query vectors), `groundtruth.neighbors.ibin` (ground truth neighbors), and `groundtruth.distances.fbin` (ground truth distances). The first two files are for index building and searching, while the other two are associated with a particular distance and are used for evaluation. + +The file suffixes `.fbin`, `.f16bin`, `.ibin`, `.u8bin`, and `.i8bin` denote that the data type of vectors stored in the file are `float32`, `float16`(a.k.a `half`), `int`, `uint8`, and `int8`, respectively. +These binary files are little-endian and the format is: the first 8 bytes are `num_vectors` (`uint32_t`) and `num_dimensions` (`uint32_t`), and the following `num_vectors * num_dimensions * sizeof(type)` bytes are vectors stored in row-major order. + +Some implementation can take `float16` database and query vectors as inputs and will have better performance. Use `script/fbin_to_f16bin.py` to transform dataset from `float32` to `float16` type. + +Commonly used datasets can be downloaded from two websites: +1. Million-scale datasets can be found at the [Data sets](https://github.com/erikbern/ann-benchmarks#data-sets) section of [`ann-benchmarks`](https://github.com/erikbern/ann-benchmarks). + + However, these datasets are in HDF5 format. Use `cpp/bench/ann/scripts/hdf5_to_fbin.py` to transform the format. A few Python packages are required to run it: + ```bash + pip3 install numpy h5py + ``` + The usage of this script is: + ```bash + $ cpp/bench/ann/scripts/hdf5_to_fbin.py + usage: scripts/hdf5_to_fbin.py [-n] .hdf5 + -n: normalize base/query set + outputs: .base.fbin + .query.fbin + .groundtruth.neighbors.ibin + .groundtruth.distances.fbin + ``` + So for an input `.hdf5` file, four output binary files will be produced. See previous section for an example of prepossessing GloVe dataset. + + Most datasets provided by `ann-benchmarks` use `Angular` or `Euclidean` distance. 
`Angular` denotes cosine distance. However, computing cosine distance reduces to computing inner product by normalizing vectors beforehand. In practice, we can always do the normalization to decrease computation cost, so it's better to measure the performance of inner product rather than cosine distance. The `-n` option of `hdf5_to_fbin.py` can be used to normalize the dataset. + +2. Billion-scale datasets can be found at [`big-ann-benchmarks`](http://big-ann-benchmarks.com). The ground truth file contains both neighbors and distances, thus should be split. A script is provided for this: + ```bash + $ cpp/bench/ann/scripts/split_groundtruth.pl + usage: script/split_groundtruth.pl input output_prefix + ``` + Take Deep-1B dataset as an example: + ```bash + pushd + cd cpp/bench/ann + mkdir -p data/deep-1B && cd data/deep-1B + # download manually "Ground Truth" file of "Yandex DEEP" + # suppose the file name is deep_new_groundtruth.public.10K.bin + ../../scripts/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth + # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced + popd + ``` + Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This mean we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation. + + +##### Step 2: Build Index +An index is a data structure to facilitate searching. Different algorithms may use different data structures for their index. We can use `RAFT_IVF_FLAT_ANN_BENCH -b` to build an index and save it to disk. + +To run a benchmark executable, like `RAFT_IVF_FLAT_ANN_BENCH`, a JSON configuration file is required. Refer to [`cpp/bench/ann/conf/glove-100-inner.json`](../../cpp/cpp/bench/ann/conf/glove-100-inner.json) as an example. 
The configuration file has 3 sections:
+ - if `refine_ratio` is specified, refinement, as a post-processing step of search, will be done. It's for algorithms that compress vectors. For example, if `"refine_ratio" : 2` is set, 2`k` results are first computed, then exact distances of them are computed using original uncompressed vectors, and finally top `k` results among them are kept. + + +The usage of `*_ANN_BENCH` can be found by running `*_ANN_BENCH -h` on one of the executables: +```bash +$ ./cpp/build/*_ANN_BENCH -h +usage: ./cpp/build/*_ANN_BENCH -b|s [-f] [-i index_names] conf.json + -b: build mode, will build index + -s: search mode, will search using built index + one and only one of -b and -s should be specified + -f: force overwriting existing output files + -i: by default will build/search all the indices found in conf.json + '-i' can be used to select a subset of indices + 'index_names' is a list of comma-separated index names + '*' is allowed as the last character of a name to select all matched indices + for example, -i "hnsw1,hnsw2,faiss" or -i "hnsw*,faiss" +``` +* `-b`: build index. +* `-s`: do the searching with built index. +* `-f`: before doing the real task, the program checks that needed input files exist and output files don't exist. If these conditions are not met, it quits so no file would be overwritten accidentally. To ignore existing output files and force overwrite them, use the `-f` option. +* `-i`: by default, the `-b` flag will build all indices found in the configuration file, and `-s` will search using all the indices. To select a subset of indices to build or search, we can use the `-i` option. + +It's easier to describe the usage of `-i` option with an example. Suppose we have a configuration file `a.json`, and it contains: +```json + "index" : [ + { + "name" : "hnsw1", + ... + }, + { + "name" : "hnsw1", + ... + }, + { + "name" : "faiss", + ... 
+ } + ] +``` +Then, +```bash +# build all indices: hnsw1, hnsw2 and faiss +./cpp/build/HNSWLIB_ANN_BENCH -b a.json + +# build only hnsw1 +./cpp/build/HNSWLIB_ANN_BENCH -b -i hnsw1 a.json + +# build hnsw1 and hnsw2 +./cpp/build/HNSWLIB_ANN_BENCH -b -i hnsw1,hnsw2 a.json + +# build hnsw1 and hnsw2 +./cpp/build/HNSWLIB_ANN_BENCH -b -i 'hnsw*' a.json + +# build faiss +./cpp/build/FAISS_IVF_FLAT_ANN_BENCH -b -i 'faiss' a.json +``` +In the last two commands, we use wildcard "`*`" to match both `hnsw1` and `hnsw2`. Note the use of "`*`" is quite limited. It can occur only at the end of a pattern, so both "`*nsw1`" and "`h*sw1`" are interpreted literally and will not match anything. Also note that quotation marks must be used to prevent "`*`" from being interpreted by the shell. + + +##### Step 3: Searching +Use the `-s` flag on any of the `*_ANN_BENCH` executables. Other options are the same as in step 2. + + +##### Step 4: Evaluating Results +Use `cpp/bench/ann/scripts/eval.pl` to evaluate benchmark results. The usage is: +```bash +$ cpp/bench/ann/scripts/eval.pl +usage: [-f] [-o output.csv] groundtruth.neighbors.ibin result_paths... + result_paths... are paths to the search result files. + Can specify multiple paths. + For each of them, if it's a directory, all the .txt files found under + it recursively will be regarded as inputs. + + -f: force to recompute recall and update it in result file if needed + -o: also write result to a csv file +``` +Note that there can be multiple arguments for paths of result files. Each argument can be either a file name or a path. If it's a directory, all files found under it recursively will be used as input files. 
+An example: +```bash +cpp/bench/ann/scripts/eval.pl groundtruth.neighbors.ibin \ + result/glove-100-angular/10/hnsw/angular_M_24_*.txt \ + result/glove-100-angular/10/faiss/ +``` +The search result files used by this command are files matching `result/glove-100-angular/10/hnsw/angular_M_24_*.txt`, and all `.txt` files under directory `result/glove-100-angular/10/faiss/` recursively. + +This script prints recall and QPS for every result file. Also, it outputs estimated "recall at QPS=2000" and "QPS at recall=0.9", which can be used to compare performance quantitatively. + +It saves recall value in result txt file, so avoids to recompute recall if the same command is run again. To force to recompute recall, option `-f` can be used. If option `-o ` is specified, a csv output file will be produced. This file can be used to plot Throughput-Recall curves. + +## Adding a new ANN algorithm +Implementation of a new algorithm should be a class that inherits `class ANN` (defined in `cpp/bench/ann/src/ann.h`) and implements all the pure virtual functions. + +In addition, it should define two `struct`s for building and searching parameters. The searching parameter class should inherit `struct ANN::AnnSearchParam`. Take `class HnswLib` as an example, its definition is: +```c++ +template +class HnswLib : public ANN { +public: + struct BuildParam { + int M; + int ef_construction; + int num_threads; + }; + + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + int ef; + int num_threads; + }; + + // ... +}; +``` + +The benchmark program uses JSON configuration file. To add the new algorithm to the benchmark, need be able to specify `build_param`, whose value is a JSON object, and `search_params`, whose value is an array of JSON objects, for this algorithm in configuration file. 
Still take the configuration for `HnswLib` as an example: +```json +{ + "name" : "...", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":32}, + "file" : "/path/to/file", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + ], + "search_result_file" : "/path/to/file" +}, +``` + +How to interpret these JSON objects is totally left to the implementation and should be specified in `cpp/bench/ann/src/factory.cuh`: +1. First, add two functions for parsing JSON object to `struct BuildParam` and `struct SearchParam`, respectively: + ```c++ + template + void parse_build_param(const nlohmann::json& conf, + typename cuann::HnswLib::BuildParam& param) { + param.ef_construction = conf.at("efConstruction"); + param.M = conf.at("M"); + if (conf.contains("numThreads")) { + param.num_threads = conf.at("numThreads"); + } + } + + template + void parse_search_param(const nlohmann::json& conf, + typename cuann::HnswLib::SearchParam& param) { + param.ef = conf.at("ef"); + if (conf.contains("numThreads")) { + param.num_threads = conf.at("numThreads"); + } + } + ``` + +2. Next, add corresponding `if` case to functions `create_algo()` and `create_search_param()` by calling parsing functions. The string literal in `if` condition statement must be the same as the value of `algo` in configuration file. For example, + ```c++ + // JSON configuration file contains a line like: "algo" : "hnswlib" + if (algo == "hnswlib") { + // ... 
+ } + ``` diff --git a/docs/source/index.rst b/docs/source/index.rst index 814899c36b..23e346c872 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -44,6 +44,7 @@ While not exhaustive, the following general categories help summarize the accele developer_guide.md cpp_api.rst pylibraft_api.rst + cuda_ann_benchmarks.md raft_dask_api.rst using_comms.rst using_libraft.md diff --git a/img/arch.png b/img/arch.png new file mode 100644 index 0000000000..ea9cad9204 Binary files /dev/null and b/img/arch.png differ diff --git a/img/raft.png b/img/raft.png new file mode 100644 index 0000000000..45589614f5 Binary files /dev/null and b/img/raft.png differ diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt index 77a2a7114e..989241db32 100644 --- a/python/pylibraft/CMakeLists.txt +++ b/python/pylibraft/CMakeLists.txt @@ -36,11 +36,11 @@ option(RAFT_BUILD_WHEELS "Whether this build is generating a Python wheel." OFF) # If the user requested it we attempt to find RAFT. if(FIND_RAFT_CPP) - find_package(raft ${pylibraft_version} REQUIRED COMPONENTS distance) - if(NOT TARGET raft::raft_distance_lib) + find_package(raft ${pylibraft_version} REQUIRED COMPONENTS compiled) + if(NOT TARGET raft::raft_lib) message( FATAL_ERROR - "Building against a preexisting libraft library requires the distance components of that library to have been built!" + "Building against a preexisting libraft library requires the compiled libraft to have been built!" 
) endif() @@ -62,8 +62,7 @@ if(NOT raft_FOUND) set(BUILD_TESTS OFF) set(BUILD_BENCH OFF) - set(RAFT_COMPILE_LIBRARIES OFF) - set(RAFT_COMPILE_DIST_LIBRARY ON) + set(RAFT_COMPILE_LIBRARY ON) set(_exclude_from_all "") if(RAFT_BUILD_WHEELS) @@ -75,11 +74,11 @@ if(NOT raft_FOUND) add_subdirectory(../../cpp raft-cpp ${_exclude_from_all}) - # When building the C++ libraries from source we must copy libraft_distance.so alongside the + # When building the C++ libraries from source we must copy libraft.so alongside the # pairwise_distance and random Cython libraries TODO: when we have a single 'compiled' raft # library, we shouldn't need this set(cython_lib_dir pylibraft) - install(TARGETS raft_distance_lib DESTINATION ${cython_lib_dir}) + install(TARGETS raft_lib DESTINATION ${cython_lib_dir}) endif() rapids_cython_init() diff --git a/python/pylibraft/pylibraft/cluster/CMakeLists.txt b/python/pylibraft/pylibraft/cluster/CMakeLists.txt index ba77403a5d..7d6e05d918 100644 --- a/python/pylibraft/pylibraft/cluster/CMakeLists.txt +++ b/python/pylibraft/pylibraft/cluster/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -14,7 +14,7 @@ # Set the list of Cython files to build set(cython_sources kmeans.pyx) -set(linked_libraries raft::distance) +set(linked_libraries raft::compiled) # Build all of the Cython targets rapids_cython_create_modules( diff --git a/python/pylibraft/pylibraft/common/mdspan.pyx b/python/pylibraft/pylibraft/common/mdspan.pyx index c7b42ecab7..f35a94bb9c 100644 --- a/python/pylibraft/pylibraft/common/mdspan.pyx +++ b/python/pylibraft/pylibraft/common/mdspan.pyx @@ -159,7 +159,6 @@ cdef device_matrix_view[float, int64_t, row_major] \ return make_device_matrix_view[float, int64_t, row_major]( cai.data, shape[0], shape[1]) - cdef device_matrix_view[uint8_t, int64_t, row_major] \ get_dmv_uint8(cai, check_shape) except *: if cai.dtype != np.uint8: diff --git a/python/pylibraft/pylibraft/distance/CMakeLists.txt b/python/pylibraft/pylibraft/distance/CMakeLists.txt index cae00007d6..14f0cc441a 100644 --- a/python/pylibraft/pylibraft/distance/CMakeLists.txt +++ b/python/pylibraft/pylibraft/distance/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -14,7 +14,7 @@ # Set the list of Cython files to build set(cython_sources pairwise_distance.pyx fused_l2_nn.pyx) -set(linked_libraries raft::raft raft::distance) +set(linked_libraries raft::raft raft::compiled) # Build all of the Cython targets rapids_cython_create_modules( diff --git a/python/pylibraft/pylibraft/neighbors/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/CMakeLists.txt index 572ea47f4e..7b9c1591c1 100644 --- a/python/pylibraft/pylibraft/neighbors/CMakeLists.txt +++ b/python/pylibraft/pylibraft/neighbors/CMakeLists.txt @@ -13,8 +13,8 @@ # ============================================================================= # Set the list of Cython files to build -set(cython_sources common.pyx refine.pyx) -set(linked_libraries raft::raft raft::distance) +set(cython_sources common.pyx refine.pyx brute_force.pyx) +set(linked_libraries raft::raft raft::compiled) # Build all of the Cython targets rapids_cython_create_modules( diff --git a/python/pylibraft/pylibraft/neighbors/__init__.py b/python/pylibraft/pylibraft/neighbors/__init__.py index f7510ba2db..a50b6f21a7 100644 --- a/python/pylibraft/pylibraft/neighbors/__init__.py +++ b/python/pylibraft/pylibraft/neighbors/__init__.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # + +from pylibraft.neighbors import brute_force + from .refine import refine -__all__ = ["common", "refine"] +__all__ = ["common", "refine", "brute_force"] diff --git a/python/pylibraft/pylibraft/neighbors/brute_force.pyx b/python/pylibraft/pylibraft/neighbors/brute_force.pyx new file mode 100644 index 0000000000..dbd888756d --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/brute_force.pyx @@ -0,0 +1,179 @@ +# +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +import numpy as np + +from cython.operator cimport dereference as deref +from libcpp cimport bool, nullptr +from libcpp.vector cimport vector + +from pylibraft.distance.distance_type cimport DistanceType + +from pylibraft.common import ( + DeviceResources, + auto_convert_output, + cai_wrapper, + device_ndarray, +) + +from libc.stdint cimport int64_t, uintptr_t + +from pylibraft.common.cpp.optional cimport optional +from pylibraft.common.handle cimport device_resources +from pylibraft.common.mdspan cimport get_dmv_float, get_dmv_int64 + +from pylibraft.common.handle import auto_sync_handle +from pylibraft.common.input_validation import is_c_contiguous +from pylibraft.common.interruptible import cuda_interruptible + +from pylibraft.distance.distance_type cimport DistanceType + +# TODO: Centralize this + +from pylibraft.distance.pairwise_distance import DISTANCE_TYPES + +from pylibraft.common.cpp.mdspan cimport ( + device_matrix_view, + host_matrix_view, + make_device_matrix_view, + make_host_matrix_view, + row_major, +) +from pylibraft.neighbors.cpp.brute_force cimport knn as c_knn + + +def _get_array_params(array_interface, check_dtype=None): + dtype = np.dtype(array_interface["typestr"]) + if check_dtype is not None and dtype != check_dtype: + raise TypeError("dtype %s not supported" % dtype) + shape = array_interface["shape"] + if len(shape) != 2: + raise ValueError("Expected a 2D array, got %d D" % len(shape)) + data = 
array_interface["data"][0] + return (shape, dtype, data) + + +@auto_sync_handle +@auto_convert_output +def knn(dataset, queries, k=None, indices=None, distances=None, + metric="sqeuclidean", metric_arg=2.0, + global_id_offset=0, handle=None): + """ + Perform a brute-force nearest neighbors search. + + Parameters + ---------- + dataset : array interface compliant matrix, row-major layout, + shape (n_samples, dim). Supported dtype [float] + queries : array interface compliant matrix, row-major layout, + shape (n_queries, dim) Supported dtype [float] + k : int + Number of neighbors to search (k <= 2048). Optional if indices or + distances arrays are given (in which case their second dimension + is k). + indices : Optional array interface compliant matrix shape + (n_queries, k), dtype int64_t. If supplied, neighbor + indices will be written here in-place. (default None) + Supported dtype uint64 + distances : Optional array interface compliant matrix shape + (n_queries, k), dtype float. If supplied, neighbor + indices will be written here in-place. (default None) + + {handle_docstring} + + Returns + ------- + indices: array interface compliant object containing resulting indices + shape (n_queries, k) + + distances: array interface compliant object containing resulting distances + shape (n_queries, k) + + Examples + -------- + + >>> import cupy as cp + + >>> from pylibraft.common import DeviceResources + >>> from pylibraft.neighbors.brute_force import knn + + >>> n_samples = 50000 + >>> n_features = 50 + >>> n_queries = 1000 + + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... dtype=cp.float32) + >>> # Search using the built index + >>> queries = cp.random.random_sample((n_queries, n_features), + ... 
dtype=cp.float32) + >>> k = 40 + >>> distances, neighbors = knn(dataset, queries, k) + >>> distances = cp.asarray(distances) + >>> neighbors = cp.asarray(neighbors) + """ + + if handle is None: + handle = DeviceResources() + + dataset_cai = cai_wrapper(dataset) + queries_cai = cai_wrapper(queries) + + if k is None: + if indices is not None: + k = cai_wrapper(indices).shape[1] + elif distances is not None: + k = cai_wrapper(distances).shape[1] + else: + raise ValueError("Argument k must be specified if both indices " + "and distances arg is None") + + n_queries = cai_wrapper(queries).shape[0] + + if indices is None: + indices = device_ndarray.empty((n_queries, k), dtype='int64') + + if distances is None: + distances = device_ndarray.empty((n_queries, k), dtype='float32') + + cdef DistanceType c_metric = DISTANCE_TYPES[metric] + + distances_cai = cai_wrapper(distances) + indices_cai = cai_wrapper(indices) + + cdef optional[float] c_metric_arg = metric_arg + cdef optional[int64_t] c_global_offset = global_id_offset + + cdef device_resources* handle_ = \ + handle.getHandle() + + if dataset_cai.dtype == np.float32: + with cuda_interruptible(): + c_knn(deref(handle_), + get_dmv_float(dataset_cai, check_shape=True), + get_dmv_float(queries_cai, check_shape=True), + get_dmv_int64(indices_cai, check_shape=True), + get_dmv_float(distances_cai, check_shape=True), + c_metric, + c_metric_arg, + c_global_offset) + else: + raise TypeError("dtype %s not supported" % dataset_cai.dtype) + + return (distances, indices) diff --git a/python/pylibraft/pylibraft/neighbors/common.pyx b/python/pylibraft/pylibraft/neighbors/common.pyx index a8380b589b..24c1abcf18 100644 --- a/python/pylibraft/pylibraft/neighbors/common.pyx +++ b/python/pylibraft/pylibraft/neighbors/common.pyx @@ -22,13 +22,15 @@ import warnings from pylibraft.distance.distance_type cimport DistanceType +SUPPORTED_DISTANCES = { + "sqeuclidean": DistanceType.L2Expanded, + "euclidean": DistanceType.L2SqrtExpanded, + 
"inner_product": DistanceType.InnerProduct, + +} + def _get_metric(metric): - SUPPORTED_DISTANCES = { - "sqeuclidean": DistanceType.L2Expanded, - "euclidean": DistanceType.L2SqrtExpanded, - "inner_product": DistanceType.InnerProduct - } if metric not in SUPPORTED_DISTANCES: if metric == "l2_expanded": warnings.warn("Using l2_expanded as a metric name is deprecated," diff --git a/python/pylibraft/pylibraft/neighbors/cpp/__init__.pxd b/python/pylibraft/pylibraft/neighbors/cpp/__init__.pxd new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/pylibraft/pylibraft/neighbors/cpp/__init__.py b/python/pylibraft/pylibraft/neighbors/cpp/__init__.py new file mode 100644 index 0000000000..a7e7b75096 --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/cpp/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pylibraft/pylibraft/neighbors/cpp/brute_force.pxd b/python/pylibraft/pylibraft/neighbors/cpp/brute_force.pxd new file mode 100644 index 0000000000..de5e0af267 --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/cpp/brute_force.pxd @@ -0,0 +1,55 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +import numpy as np + +import pylibraft.common.handle + +from cython.operator cimport dereference as deref +from libc.stdint cimport int8_t, int64_t, uint8_t, uint64_t, uintptr_t +from libcpp cimport bool, nullptr +from libcpp.string cimport string +from libcpp.vector cimport vector + +from rmm._lib.memory_resource cimport device_memory_resource + +from pylibraft.common.cpp.mdspan cimport ( + device_matrix_view, + host_matrix_view, + make_device_matrix_view, + make_host_matrix_view, + row_major, +) +from pylibraft.common.cpp.optional cimport optional +from pylibraft.common.handle cimport device_resources +from pylibraft.distance.distance_type cimport DistanceType + + +cdef extern from "raft_runtime/neighbors/brute_force.hpp" \ + namespace "raft::runtime::neighbors::brute_force" nogil: + + cdef void knn(const device_resources & handle, + device_matrix_view[float, int64_t, row_major] index, + device_matrix_view[float, int64_t, row_major] search, + device_matrix_view[int64_t, int64_t, row_major] indices, + device_matrix_view[float, int64_t, row_major] distances, + DistanceType metric, + optional[float] metric_arg, + optional[int64_t] global_id_offset) except + diff --git a/python/pylibraft/pylibraft/neighbors/ivf_flat/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/ivf_flat/CMakeLists.txt index f183e17157..8f395faec9 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_flat/CMakeLists.txt +++ 
b/python/pylibraft/pylibraft/neighbors/ivf_flat/CMakeLists.txt @@ -14,7 +14,7 @@ # Set the list of Cython files to build set(cython_sources ivf_flat.pyx) -set(linked_libraries raft::raft raft::distance) +set(linked_libraries raft::raft raft::compiled) # Build all of the Cython targets rapids_cython_create_modules( diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt index cfce37b560..e3d721a6ea 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -14,7 +14,7 @@ # Set the list of Cython files to build set(cython_sources ivf_pq.pyx) -set(linked_libraries raft::raft raft::distance) +set(linked_libraries raft::raft raft::compiled) # Build all of the Cython targets rapids_cython_create_modules( diff --git a/python/pylibraft/pylibraft/random/CMakeLists.txt b/python/pylibraft/pylibraft/random/CMakeLists.txt index 49ca8627cc..fcc5ee6311 100644 --- a/python/pylibraft/pylibraft/random/CMakeLists.txt +++ b/python/pylibraft/pylibraft/random/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -17,7 +17,7 @@ set(cython_sources rmat_rectangular_generator.pyx) # TODO: should finally be replaced with 'compiled' library to be more generic, when that is # available -set(linked_libraries raft::raft raft::distance) +set(linked_libraries raft::raft raft::compiled) # Build all of the Cython targets rapids_cython_create_modules( diff --git a/python/pylibraft/pylibraft/test/test_brute_force.py b/python/pylibraft/pylibraft/test/test_brute_force.py new file mode 100644 index 0000000000..f349be892d --- /dev/null +++ b/python/pylibraft/pylibraft/test/test_brute_force.py @@ -0,0 +1,99 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import numpy as np +import pytest +from scipy.spatial.distance import cdist + +from pylibraft.common import DeviceResources, Stream, device_ndarray +from pylibraft.neighbors.brute_force import knn + + +@pytest.mark.parametrize("n_index_rows", [32, 100]) +@pytest.mark.parametrize("n_query_rows", [32, 100]) +@pytest.mark.parametrize("n_cols", [40, 100]) +@pytest.mark.parametrize("k", [1, 5, 32]) +@pytest.mark.parametrize( + "metric", + [ + "euclidean", + "cityblock", + "chebyshev", + "canberra", + "correlation", + "russellrao", + "cosine", + "sqeuclidean", + # "inner_product", + ], +) +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize("order", ["F", "C"]) +@pytest.mark.parametrize("dtype", [np.float32]) +def test_knn( + n_index_rows, n_query_rows, n_cols, k, inplace, metric, order, dtype +): + index = np.random.random_sample((n_index_rows, n_cols)).astype(dtype) + queries = np.random.random_sample((n_query_rows, n_cols)).astype(dtype) + + # RussellRao expects boolean arrays + if metric == "russellrao": + index[index < 0.5] = 0.0 + index[index >= 0.5] = 1.0 + queries[queries < 0.5] = 0.0 + queries[queries >= 0.5] = 1.0 + + indices = np.zeros((n_query_rows, k), dtype="int64") + distances = np.zeros((n_query_rows, k), dtype=dtype) + + index_device = device_ndarray(index) + + queries_device = device_ndarray(queries) + indices_device = device_ndarray(indices) + distances_device = device_ndarray(distances) + + s2 = Stream() + handle = DeviceResources(stream=s2) + ret_distances, ret_indices = knn( + index_device, + queries_device, + k, + indices=indices_device, + distances=distances_device, + metric=metric, + handle=handle, + ) + handle.sync() + + pw_dists = cdist(queries, index, metric=metric) + + distances_device = ret_distances if not inplace else distances_device + + actual_distances = distances_device.copy_to_host() + + actual_distances[actual_distances <= 1e-5] = 0.0 + argsort = np.argsort(pw_dists, axis=1) + + for i in 
range(pw_dists.shape[0]): + expected_indices = argsort[i] + gpu_dists = actual_distances[i] + + if metric == "correlation" or metric == "cosine": + gpu_dists = gpu_dists[::-1] + + cpu_ordered = pw_dists[i, expected_indices] + np.testing.assert_allclose( + cpu_ordered[:k], gpu_dists, atol=1e-4, rtol=1e-4 + ) diff --git a/python/pylibraft/pylibraft/test/test_doctests.py b/python/pylibraft/pylibraft/test/test_doctests.py index 3276ca115f..34be6c55f5 100644 --- a/python/pylibraft/pylibraft/test/test_doctests.py +++ b/python/pylibraft/pylibraft/test/test_doctests.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -96,6 +96,7 @@ def _find_doctests_in_obj(obj, finder=None, criteria=None): DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.distance)) DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors)) DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors.ivf_pq)) +DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors.brute_force)) DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.random)) diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml index 785a6df6c8..4fe0a52ce6 100644 --- a/python/pylibraft/pyproject.toml +++ b/python/pylibraft/pyproject.toml @@ -15,15 +15,15 @@ [build-system] requires = [ - "wheel", - "setuptools", - "cython>=0.29,<0.30", - "cuda-python>=11.7.1,<12.0", - "scikit-build>=0.13.1", "cmake>=3.23.1,!=3.25.0", + "cuda-python >=11.7.1,<12.0", + "cython>=0.29,<0.30", "ninja", "rmm==23.6.*", -] + "scikit-build>=0.13.1", + "setuptools", + "wheel", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
build-backend = "setuptools.build_meta" [project] @@ -37,10 +37,10 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.8" dependencies = [ - "numpy", - "cuda-python>=11.7.1,<12.0", + "cuda-python >=11.7.1,<12.0", + "numpy>=1.21", "rmm==23.6.*", -] +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", "Programming Language :: Python", @@ -50,10 +50,12 @@ classifiers = [ [project.optional-dependencies] test = [ + "cupy", "pytest", - "scipy", + "pytest-cov", "scikit-learn", -] + "scipy", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/raft" diff --git a/python/pylibraft/setup.cfg b/python/pylibraft/setup.cfg new file mode 100644 index 0000000000..7d1a0c9065 --- /dev/null +++ b/python/pylibraft/setup.cfg @@ -0,0 +1,38 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ +[isort] +line_length=79 +multi_line_output=3 +include_trailing_comma=True +force_grid_wrap=0 +combine_as_imports=True +order_by_type=True +known_dask= + dask + distributed + dask_cuda +known_rapids= + nvtext + cudf + cuml + cugraph + dask_cudf + rmm +known_first_party= + raft + pylibraft +default_section=THIRDPARTY +sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER +skip= + thirdparty + .eggs + .git + .hg + .mypy_cache + .tox + .venv + _build + buck-out + build + dist + __init__.py diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index 88ac8d80ac..1fb5aa8f7c 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -15,13 +15,13 @@ [build-system] requires = [ - "wheel", - "setuptools", - "cython>=0.29,<0.30", - "scikit-build>=0.13.1", "cmake>=3.23.1,!=3.25.0", + "cython>=0.29,<0.30", "ninja", -] + "scikit-build>=0.13.1", + "setuptools", + "wheel", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project] name = "raft-dask" @@ -34,15 +34,15 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.8" dependencies = [ - "numpy", - "numba>=0.49", - "joblib>=0.11", "dask-cuda==23.6.*", "dask>=2023.1.1", - "ucx-py==0.32.*", "distributed>=2023.1.1", + "joblib>=0.11", + "numba>=0.49", + "numpy>=1.21", "pylibraft==23.6.*", -] + "ucx-py==0.32.*", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", "Programming Language :: Python", @@ -53,8 +53,8 @@ classifiers = [ [project.optional-dependencies] test = [ "pytest", - "dask[distributed,dataframe]", -] + "pytest-cov", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[project.urls] Homepage = "https://github.com/rapidsai/raft" diff --git a/python/raft-dask/raft_dask/common/comms.py b/python/raft-dask/raft_dask/common/comms.py index 56e40b98da..ebe9a8dc4f 100644 --- a/python/raft-dask/raft_dask/common/comms.py +++ b/python/raft-dask/raft_dask/common/comms.py @@ -19,7 +19,7 @@ import warnings from collections import OrderedDict -from dask.distributed import default_client, get_worker +from dask.distributed import default_client from pylibraft.common.handle import Handle @@ -242,7 +242,7 @@ def destroy(self): self.ucx_initialized = False -def local_handle(sessionId): +def local_handle(sessionId, dask_worker=None): """ Simple helper function for retrieving the local handle_t instance for a comms session on a worker. @@ -251,16 +251,19 @@ def local_handle(sessionId): ---------- sessionId : str session identifier from an initialized comms instance + dask_worker : dask_worker object + (Note: if called by client.run(), this is supplied by Dask + and not the client) Returns ------- handle : raft.Handle or None """ - state = get_raft_comm_state(sessionId, get_worker()) + state = get_raft_comm_state(sessionId, dask_worker) return state["handle"] if "handle" in state else None -def get_raft_comm_state(sessionId, state_object=None): +def get_raft_comm_state(sessionId, state_object=None, dask_worker=None): """ Retrieves cuML comms state on the scheduler node, for the given sessionId, creating a new session if it does not exist. 
If no session id is given, @@ -271,13 +274,16 @@ def get_raft_comm_state(sessionId, state_object=None): sessionId : SessionId value to retrieve from the dask_scheduler instances state_object : Object (either Worker, or Scheduler) on which the raft comm state will retrieved (or created) + dask_worker : dask_worker object + (Note: if called by client.run(), this is supplied by Dask + and not the client) Returns ------- session state : str session state associated with sessionId """ - state_object = state_object if state_object is not None else get_worker() + state_object = state_object if state_object is not None else dask_worker if not hasattr(state_object, "_raft_comm_state"): state_object._raft_comm_state = {} @@ -308,13 +314,19 @@ def set_nccl_root(sessionId, state_object): return raft_comm_state["nccl_uid"] -def get_ucx(): +def get_ucx(dask_worker=None): """ A simple convenience wrapper to make sure UCP listener and endpoints are only ever assigned once per worker. + + Parameters + ---------- + dask_worker : dask_worker object + (Note: if called by client.run(), this is supplied by Dask + and not the client) """ raft_comm_state = get_raft_comm_state( - sessionId="ucp", state_object=get_worker() + sessionId="ucp", state_object=dask_worker ) if "ucx" not in raft_comm_state: raft_comm_state["ucx"] = UCX.get() @@ -371,7 +383,7 @@ def _func_set_scheduler_as_nccl_root(sessionId, verbose, dask_scheduler): return nccl_uid -def _func_set_worker_as_nccl_root(sessionId, verbose): +def _func_set_worker_as_nccl_root(sessionId, verbose, dask_worker=None): """ Creates a persistent nccl uniqueId on the scheduler node. @@ -380,63 +392,74 @@ def _func_set_worker_as_nccl_root(sessionId, verbose): ---------- sessionId : Associated session to attach the unique ID to. 
verbose : Indicates whether or not to emit additional information + dask_worker : dask_worker object + (Note: if called by client.run(), this is supplied by Dask + and not the client) Return ------ uniqueId : byte str NCCL uniqueId, associating this DASK worker as its root node. """ - worker = get_worker() if verbose: - worker.log_event( + dask_worker.log_event( topic="info", msg=f"Setting worker as NCCL root for session, '{sessionId}'", ) - nccl_uid = set_nccl_root(sessionId=sessionId, state_object=worker) + nccl_uid = set_nccl_root(sessionId=sessionId, state_object=dask_worker) if verbose: - worker.log_event( + dask_worker.log_event( topic="info", msg="Done setting scheduler as NCCL root." ) return nccl_uid -def _func_ucp_listener_port(): - return get_ucx().listener_port() +def _func_ucp_listener_port(dask_worker=None): + return get_ucx(dask_worker=dask_worker).listener_port() async def _func_init_all( - sessionId, uniqueId, comms_p2p, worker_info, verbose, streams_per_handle + sessionId, + uniqueId, + comms_p2p, + worker_info, + verbose, + streams_per_handle, + dask_worker=None, ): - worker = get_worker() raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=worker + sessionId=sessionId, state_object=dask_worker ) raft_comm_state["nccl_uid"] = uniqueId - raft_comm_state["wid"] = worker_info[get_worker().address]["rank"] + raft_comm_state["wid"] = worker_info[dask_worker.address]["rank"] raft_comm_state["nworkers"] = len(worker_info) if verbose: - worker.log_event(topic="info", msg="Initializing NCCL.") + dask_worker.log_event(topic="info", msg="Initializing NCCL.") start = time.time() - _func_init_nccl(sessionId, uniqueId) + _func_init_nccl(sessionId, uniqueId, dask_worker=dask_worker) if verbose: elapsed = time.time() - start - worker.log_event( + dask_worker.log_event( topic="info", msg=f"NCCL Initialization took: {elapsed} seconds." 
) if comms_p2p: if verbose: - worker.log_event(topic="info", msg="Initializing UCX Endpoints") + dask_worker.log_event( + topic="info", msg="Initializing UCX Endpoints" + ) if verbose: start = time.time() - await _func_ucp_create_endpoints(sessionId, worker_info) + await _func_ucp_create_endpoints( + sessionId, worker_info, dask_worker=dask_worker + ) if verbose: elapsed = time.time() - start @@ -444,18 +467,22 @@ async def _func_init_all( f"Done initializing UCX endpoints." f"Took: {elapsed} seconds.\nBuilding handle." ) - worker.log_event(topic="info", msg=msg) + dask_worker.log_event(topic="info", msg=msg) - _func_build_handle_p2p(sessionId, streams_per_handle, verbose) + _func_build_handle_p2p( + sessionId, streams_per_handle, verbose, dask_worker=dask_worker + ) if verbose: - worker.log_event(topic="info", msg="Done building handle.") + dask_worker.log_event(topic="info", msg="Done building handle.") else: - _func_build_handle(sessionId, streams_per_handle, verbose) + _func_build_handle( + sessionId, streams_per_handle, verbose, dask_worker=dask_worker + ) -def _func_init_nccl(sessionId, uniqueId): +def _func_init_nccl(sessionId, uniqueId, dask_worker=None): """ Initialize ncclComm_t on worker @@ -466,11 +493,13 @@ def _func_init_nccl(sessionId, uniqueId): uniqueId : array[byte] The NCCL unique Id generated from the client. + dask_worker : dask_worker object + (Note: if called by client.run(), this is supplied by Dask + and not the client) """ - worker = get_worker() raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=get_worker() + sessionId=sessionId, state_object=dask_worker, dask_worker=dask_worker ) wid = raft_comm_state["wid"] nWorkers = raft_comm_state["nworkers"] @@ -480,13 +509,15 @@ def _func_init_nccl(sessionId, uniqueId): n.init(nWorkers, uniqueId, wid) raft_comm_state["nccl"] = n except Exception as e: - worker.log_event( + dask_worker.log_event( topic="error", msg=f"An error occurred initializing NCCL: {e}." 
) raise -def _func_build_handle_p2p(sessionId, streams_per_handle, verbose): +def _func_build_handle_p2p( + sessionId, streams_per_handle, verbose, dask_worker=None +): """ Builds a handle_t on the current worker given the initialized comms @@ -495,14 +526,16 @@ def _func_build_handle_p2p(sessionId, streams_per_handle, verbose): sessionId : str id to reference state for current comms instance. streams_per_handle : int number of internal streams to create verbose : bool print verbose logging output + dask_worker : dask_worker object + (Note: if called by client.run(), this is supplied by Dask + and not the client) """ - worker = get_worker() if verbose: - worker.log_event(topic="info", msg="Building p2p handle.") + dask_worker.log_event(topic="info", msg="Building p2p handle.") - ucp_worker = get_ucx().get_worker() + ucp_worker = get_ucx(dask_worker).get_worker() raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=worker + sessionId=sessionId, state_object=dask_worker ) handle = Handle(n_streams=streams_per_handle) @@ -512,21 +545,23 @@ def _func_build_handle_p2p(sessionId, streams_per_handle, verbose): workerId = raft_comm_state["wid"] if verbose: - worker.log_event(topic="info", msg="Injecting comms on handle.") + dask_worker.log_event(topic="info", msg="Injecting comms on handle.") inject_comms_on_handle( handle, nccl_comm, ucp_worker, eps, nWorkers, workerId, verbose ) if verbose: - worker.log_event( + dask_worker.log_event( topic="info", msg="Finished injecting comms on handle." ) raft_comm_state["handle"] = handle -def _func_build_handle(sessionId, streams_per_handle, verbose): +def _func_build_handle( + sessionId, streams_per_handle, verbose, dask_worker=None +): """ Builds a handle_t on the current worker given the initialized comms @@ -535,17 +570,19 @@ def _func_build_handle(sessionId, streams_per_handle, verbose): sessionId : str id to reference state for current comms instance. 
streams_per_handle : int number of internal streams to create verbose : bool print verbose logging output + dask_worker : dask_worker object + (Note: if called by client.run(), this is supplied by Dask + and not the client) """ - worker = get_worker() if verbose: - worker.log_event( + dask_worker.log_event( topic="info", msg="Finished injecting comms on handle." ) handle = Handle(n_streams=streams_per_handle) raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=worker + sessionId=sessionId, state_object=dask_worker ) workerId = raft_comm_state["wid"] @@ -558,16 +595,18 @@ def _func_build_handle(sessionId, streams_per_handle, verbose): raft_comm_state["handle"] = handle -def _func_store_initial_state(nworkers, sessionId, uniqueId, wid): +def _func_store_initial_state( + nworkers, sessionId, uniqueId, wid, dask_worker=None +): raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=get_worker() + sessionId=sessionId, state_object=dask_worker ) raft_comm_state["nccl_uid"] = uniqueId raft_comm_state["wid"] = wid raft_comm_state["nworkers"] = nworkers -async def _func_ucp_create_endpoints(sessionId, worker_info): +async def _func_ucp_create_endpoints(sessionId, worker_info, dask_worker): """ Runs on each worker to create ucp endpoints to all other workers @@ -577,6 +616,9 @@ async def _func_ucp_create_endpoints(sessionId, worker_info): uuid unique id for this instance worker_info : dict Maps worker addresses to NCCL ranks & UCX ports + dask_worker : dask_worker object + (Note: if called by client.run(), this is supplied by Dask + and not the client) """ eps = [None] * len(worker_info) count = 1 @@ -584,40 +626,47 @@ async def _func_ucp_create_endpoints(sessionId, worker_info): for k in worker_info: ip, port = parse_host_port(k) - ep = await get_ucx().get_endpoint(ip, worker_info[k]["port"]) + ep = await get_ucx(dask_worker=dask_worker).get_endpoint( + ip, worker_info[k]["port"] + ) eps[worker_info[k]["rank"]] = ep count += 1 
raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=get_worker() + sessionId=sessionId, state_object=dask_worker ) raft_comm_state["ucp_eps"] = eps -async def _func_destroy_all(sessionId, comms_p2p, verbose=False): - worker = get_worker() +async def _func_destroy_all( + sessionId, comms_p2p, verbose=False, dask_worker=None +): if verbose: - worker.log_event(topic="info", msg="Destroying NCCL session state.") + dask_worker.log_event( + topic="info", msg="Destroying NCCL session state." + ) raft_comm_state = get_raft_comm_state( - sessionId=sessionId, state_object=worker + sessionId=sessionId, state_object=dask_worker ) if "nccl" in raft_comm_state: raft_comm_state["nccl"].destroy() del raft_comm_state["nccl"] if verbose: - worker.log_event(topic="info", msg="NCCL session state destroyed.") + dask_worker.log_event( + topic="info", msg="NCCL session state destroyed." + ) else: if verbose: - worker.log_event( + dask_worker.log_event( topic="warning", msg=f"Session state for, '{sessionId}', " f"does not contain expected 'nccl' element", ) if verbose: - worker.log_event( + dask_worker.log_event( topic="info", msg=f"Destroying CUDA handle for sessionId, '{sessionId}.'", ) @@ -626,7 +675,7 @@ async def _func_destroy_all(sessionId, comms_p2p, verbose=False): del raft_comm_state["handle"] else: if verbose: - worker.log_event( + dask_worker.log_event( topic="warning", msg=f"Session state for, '{sessionId}', " f"does not contain expected 'handle' element", diff --git a/python/raft-dask/raft_dask/test/test_comms.py b/python/raft-dask/raft_dask/test/test_comms.py index 74ec446e94..3a430f9270 100644 --- a/python/raft-dask/raft_dask/test/test_comms.py +++ b/python/raft-dask/raft_dask/test/test_comms.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,7 +17,7 @@ import pytest -from dask.distributed import Client, wait +from dask.distributed import Client, get_worker, wait try: from raft_dask.common import ( @@ -60,32 +60,32 @@ def test_comms_init_no_p2p(cluster): def func_test_collective(func, sessionId, root): - handle = local_handle(sessionId) + handle = local_handle(sessionId, dask_worker=get_worker()) return func(handle, root) def func_test_send_recv(sessionId, n_trials): - handle = local_handle(sessionId) + handle = local_handle(sessionId, dask_worker=get_worker()) return perform_test_comms_send_recv(handle, n_trials) def func_test_device_send_or_recv(sessionId, n_trials): - handle = local_handle(sessionId) + handle = local_handle(sessionId, dask_worker=get_worker()) return perform_test_comms_device_send_or_recv(handle, n_trials) def func_test_device_sendrecv(sessionId, n_trials): - handle = local_handle(sessionId) + handle = local_handle(sessionId, dask_worker=get_worker()) return perform_test_comms_device_sendrecv(handle, n_trials) def func_test_device_multicast_sendrecv(sessionId, n_trials): - handle = local_handle(sessionId) + handle = local_handle(sessionId, dask_worker=get_worker()) return perform_test_comms_device_multicast_sendrecv(handle, n_trials) def func_test_comm_split(sessionId, n_trials): - handle = local_handle(sessionId) + handle = local_handle(sessionId, dask_worker=get_worker()) return perform_test_comm_split(handle, n_trials) @@ -114,11 +114,9 @@ def func_check_uid_on_scheduler(sessionId, uniqueId, dask_scheduler): ) -def func_check_uid_on_worker(sessionId, uniqueId): - from dask.distributed import get_worker - +def func_check_uid_on_worker(sessionId, uniqueId, dask_worker=None): return func_check_uid( - sessionId=sessionId, uniqueId=uniqueId, state_object=get_worker() + sessionId=sessionId, uniqueId=uniqueId, state_object=dask_worker ) @@ -127,7 +125,7 @@ def test_handles(cluster): client = Client(cluster) def _has_handle(sessionId): - return local_handle(sessionId) is not None + 
return local_handle(sessionId, dask_worker=get_worker()) is not None try: cb = Comms(verbose=True) diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000..e64641d05b --- /dev/null +++ b/setup.cfg @@ -0,0 +1,55 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +[flake8] +filename = *.py, *.pyx, *.pxd, *.pxi +exclude = __init__.py, *.egg, build, docs, .git +force-check = True +ignore = + # line break before binary operator + W503, + # whitespace before : + E203 +per-file-ignores = + # Rules ignored only in Cython: + # E211: whitespace before '(' (used in multi-line imports) + # E225: Missing whitespace around operators (breaks cython casting syntax like ) + # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) + # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) + # E275: Missing whitespace after keyword (Doesn't work with Cython except?) + # E402: invalid syntax (works for Python, not Cython) + # E999: invalid syntax (works for Python, not Cython) + # W504: line break after binary operator (breaks lines that end with a pointer) + *.pyx: E211, E225, E226, E227, E275, E402, E999, W504 + *.pxd: E211, E225, E226, E227, E275, E402, E999, W504 + *.pxi: E211, E225, E226, E227, E275, E402, E999, W504 + +[pydocstyle] +# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather +# than include using match-dir. Note that as discussed in +# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle, +# unlike the match option above this match-dir will have no effect when +# pydocstyle is invoked from pre-commit. Therefore this exclusion list must +# also be maintained in the pre-commit config file. 
+match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks)).*$ +# Allow missing docstrings for docutils +ignore-decorators = .*(docutils|doc_apply|copy_docstring).* +select = + D201, D204, D206, D207, D208, D209, D210, D211, D214, D215, D300, D301, D302, D403, D405, D406, D407, D408, D409, D410, D411, D412, D414, D418 + # Would like to enable the following rules in the future: + # D200, D202, D205, D400 + +[mypy] +ignore_missing_imports = True +# If we don't specify this, then mypy will check excluded files if +# they are imported by a checked file. +follow_imports = skip + +[codespell] +# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - +# this is only to allow you to run codespell interactively +skip = ./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,.*_skbuild +# ignore short words, and typename parameters like OffsetT +ignore-regex = \b(.{1,4}|[A-Z]\w*T)\b +ignore-words-list = inout,unparseable,numer +builtin = clear +quiet-level = 3 diff --git a/thirdparty/LICENSES/LICENSE.pytorch b/thirdparty/LICENSES/LICENSE.pytorch new file mode 100644 index 0000000000..7ad3d737a5 --- /dev/null +++ b/thirdparty/LICENSES/LICENSE.pytorch @@ -0,0 +1,77 @@ +From PyTorch: + +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + +From Caffe2: + +Copyright (c) 2016-present, Facebook Inc. All rights reserved. + +All contributions by Facebook: +Copyright (c) 2016 Facebook Inc. 
+ +All contributions by Google: +Copyright (c) 2015 Google Inc. +All rights reserved. + +All contributions by Yangqing Jia: +Copyright (c) 2015 Yangqing Jia +All rights reserved. + +All contributions by Kakao Brain: +Copyright 2019-2020 Kakao Brain + +All contributions by Cruise LLC: +Copyright (c) 2022 Cruise LLC. +All rights reserved. + +All contributions from Caffe: +Copyright(c) 2013, 2014, 2015, the respective contributors +All rights reserved. + +All other contributions: +Copyright(c) 2015, 2016 the respective contributors +All rights reserved. + +Caffe2 uses a copyright model similar to Caffe: each contributor holds +copyright over their contributions to Caffe2. The project versioning records +all such contribution and copyright details. If a contributor wants to further +mark their specific copyright on a particular contribution, they should +indicate their copyright solely in the commit message of the change when it is +committed. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America + and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file