diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 620a13fe17..8f745848e0 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-matrix-build.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-118 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-matrix-build.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-118 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,9 +46,52 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-118 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} + wheel-build-pylibraft: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@cuda-118 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: pylibraft + package-dir: python/pylibraft + skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + wheel-publish-pylibraft: + needs: wheel-build-pylibraft + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@cuda-118 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: pylibraft + wheel-build-raft-dask: + needs: wheel-publish-pylibraft + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@cuda-118 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: raft_dask + package-dir: python/raft-dask + skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + wheel-publish-raft-dask: + needs: wheel-build-raft-dask + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@cuda-118 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: raft_dask diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index ca2e2356c0..b705557795 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -17,33 +17,80 @@ jobs: - conda-cpp-tests - conda-python-build - conda-python-tests + - wheel-build-pylibraft + - wheel-tests-pylibraft + - wheel-build-raft-dask + - wheel-tests-raft-dask secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-118 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@main + uses: 
rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-118 conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-matrix-build.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-118 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-118 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-matrix-build.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-118 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-118 with: build_type: pull-request + wheel-build-pylibraft: + needs: checks + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@cuda-118 + with: + build_type: pull-request + package-name: pylibraft + package-dir: python/pylibraft + skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + wheel-tests-pylibraft: + needs: wheel-build-pylibraft + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-118 + with: + build_type: pull-request + package-name: pylibraft + test-before-amd64: "pip install cupy-cuda11x" + # On arm also need to install cupy from the specific webpage. + test-before-arm64: "pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64" + test-unittest: "python -m pytest -v ./python/pylibraft/pylibraft/test" + test-smoketest: "python ./ci/wheel_smoke_test_pylibraft.py" + wheel-build-raft-dask: + needs: wheel-tests-pylibraft + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@cuda-118 + with: + build_type: pull-request + package-name: raft_dask + package-dir: python/raft-dask + before-wheel: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-wheelhouse" + skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + wheel-tests-raft-dask: + needs: wheel-build-raft-dask + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-118 + with: + build_type: pull-request + package-name: raft_dask + # Always want to test against latest dask/distributed. 
+      test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.02"
+      test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.02"
+      test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test"
+      test-smoketest: "python ./ci/wheel_smoke_test_raft_dask.py"
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index bd201e987f..d41a660c6d 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@main
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-118
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -24,9 +24,33 @@
       sha: ${{ inputs.sha }}
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@main
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-118
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
+  wheel-tests-pylibraft:
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-118
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      package-name: pylibraft
+      test-before-amd64: "pip install cupy-cuda11x"
+      test-before-arm64: "pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64"
+      test-unittest: "python -m pytest -v ./python/pylibraft/pylibraft/test"
+  wheel-tests-raft-dask:
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-118
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      package-name: raft_dask
+      test-before-amd64: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.02"
+      test-before-arm64: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.02"
+      test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test"
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
deleted file mode 100644
index 0a681b864b..0000000000
--- a/.github/workflows/wheels.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-name: RAFT wheels
-
-on:
-  workflow_call:
-    inputs:
-      versioneer-override:
-        type: string
-        default: ''
-      build-tag:
-        type: string
-        default: ''
-      branch:
-        required: true
-        type: string
-      date:
-        required: true
-        type: string
-      sha:
-        required: true
-        type: string
-      build-type:
-        type: string
-        default: nightly
-
-concurrency:
-  group: "raft-${{ github.workflow }}-${{ github.ref }}"
-  cancel-in-progress: true
-
-jobs:
-  pylibraft-wheel:
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux.yml@main
-    with:
-      repo: rapidsai/raft
-
-      build-type: ${{ inputs.build-type }}
-      branch: ${{ inputs.branch }}
-      sha: ${{ inputs.sha }}
-      date: ${{ inputs.date }}
-
-      package-dir: python/pylibraft
-      package-name: pylibraft
-
-      python-package-versioneer-override: ${{ inputs.versioneer-override }}
-      python-package-build-tag: ${{ inputs.build-tag }}
-
-      skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
-
-      test-extras: test
-      test-unittest: "python -m pytest -v ./python/pylibraft/pylibraft/test"
-    secrets: inherit
-  raft-dask-wheel:
-    needs: pylibraft-wheel
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux.yml@main
-    with:
-      repo: rapidsai/raft
-
-      build-type: ${{ inputs.build-type }}
-      branch: ${{ inputs.branch }}
-      sha: ${{ inputs.sha }}
-      date: ${{ inputs.date }}
-
-      package-dir: python/raft-dask
-      package-name: raft_dask
-
-      python-package-versioneer-override: ${{ inputs.versioneer-override }}
-      python-package-build-tag: ${{ inputs.build-tag }}
-
-      skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
-
-      test-extras: test
-      test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test"
-    secrets: inherit
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f48dff11cd..b766bfc066 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
 repos:
   - repo: https://github.com/PyCQA/isort
-    rev: 5.10.1
+    rev: 5.12.0
     hooks:
       - id: isort
         # Use the config file specific to each subproject so that each
diff --git a/README.md b/README.md
index 8e0da6cd6d..ccd0df4926 100755
--- a/README.md
+++ b/README.md
@@ -25,8 +25,8 @@ While not exhaustive, the following general categories help summarize the accele
 | Category | Examples |
 | --- | --- |
 | **Data Formats** | sparse & dense, conversions, data generation |
-| **Dense Operations** | linear algebra, matrix and vector operations, slicing, norms, factorization, least squares, svd & eigenvalue problems |
-| **Sparse Operations** | linear algebra, eigenvalue problems, slicing, symmetrization, components & labeling |
+| **Dense Operations** | linear algebra, matrix and vector operations, reductions, slicing, norms, factorization, least squares, svd & eigenvalue problems |
+| **Sparse Operations** | linear algebra, eigenvalue problems, slicing, norms, reductions, factorization, symmetrization, components & labeling |
 | **Spatial** | pairwise distances, nearest neighbors, neighborhood graph construction |
 | **Basic Clustering** | spectral clustering, hierarchical clustering, k-means |
 | **Solvers** | combinatorial optimization, iterative solvers |
@@ -65,17 +65,17 @@ auto matrix = raft::make_device_matrix<float>(handle, n_rows, n_cols);
 
 ### C++ Example
 
-Most of the primitives in RAFT accept a `raft::handle_t` object for the management of resources which are expensive to create, such CUDA streams, stream pools, and handles to other CUDA libraries like `cublas` and `cusolver`.
+Most of the primitives in RAFT accept a `raft::device_resources` object for the management of resources which are expensive to create, such as CUDA streams, stream pools, and handles to other CUDA libraries like `cublas` and `cusolver`.
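Before the full example, a minimal sketch of the handle lifecycle on its own may help. It assumes the `raft/core/device_resources.hpp` header that this change migrates to, and uses only accessors that appear elsewhere in this diff (`get_stream()`, `sync_stream()`):

```c++
#include <raft/core/device_resources.hpp>

#include <rmm/cuda_stream_view.hpp>

int main()
{
  // Construct once and reuse: the handle lazily creates and caches the
  // expensive resources (streams, cublas/cusolver handles) on first use.
  raft::device_resources handle;

  // RAFT primitives that accept `handle` enqueue their work on this stream.
  rmm::cuda_stream_view stream = handle.get_stream();

  // ... invoke RAFT primitives with `handle` here ...

  // Block the host until work submitted on the handle's stream completes.
  handle.sync_stream(stream);
  return 0;
}
```

A single `device_resources` instance is intended to be reused across many primitive calls on the same thread, which is how the benchmark fixtures later in this diff use it.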
 
 The example below demonstrates creating a RAFT handle and using it with `device_matrix` and `device_vector` to allocate memory, generating random clusters, and computing pairwise Euclidean distances:
 
 ```c++
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/random/make_blobs.cuh>
 #include <raft/distance/distance.cuh>
 
-raft::handle_t handle;
+raft::device_resources handle;
 
 int n_samples = 5000;
 int n_features = 50;
@@ -93,12 +93,12 @@ raft::distance::pairwise_distance(handle, input.view(), input.view(), output.vie
 It's also possible to create `raft::device_mdspan` views to invoke the same API with raw pointers and shape information:
 
 ```c++
-#include <raft/core/handle.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/random/make_blobs.cuh>
 #include <raft/distance/distance.cuh>
 
-raft::handle_t handle;
+raft::device_resources handle;
 
 int n_samples = 5000;
 int n_features = 50;
@@ -277,7 +277,7 @@ Several CMake targets can be made available by adding components in the table be
 The easiest way to build RAFT from source is to use the `build.sh` script at the root of the repository:
 1. Create an environment with the needed dependencies:
 ```
-mamba env create --name raft_dev_env -f conda/environments/all_cuda-115_arch-x86_64.yaml
+mamba env create --name raft_dev_env -f conda/environments/all_cuda-118_arch-x86_64.yaml
 mamba activate raft_dev_env
 ```
 ```
@@ -315,6 +315,7 @@ The folder structure mirrors other RAPIDS repos, with the following folders:
   - `solver`: Sparse solvers for optimization and approximation
   - `stats`: Moments, summary statistics, model performance measures
   - `util`: Various reusable tools and utilities for accelerated algorithm development
+  - `internal`: A private header-only component that hosts the code shared between benchmarks and tests.
   - `scripts`: Helpful scripts for development
   - `src`: Compiled APIs and template specializations for the shared libraries
   - `test`: Googletests source code
diff --git a/build.sh b/build.sh
index 34dcd3a2db..b47e1ed862 100755
--- a/build.sh
+++ b/build.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 # raft build script
 
@@ -153,6 +153,7 @@ function limitTests {
       # Remove the full LIMIT_TEST_TARGETS argument from list of args so that it passes validArgs function
       ARGS=${ARGS//--limit-tests=$LIMIT_TEST_TARGETS/}
       TEST_TARGETS=${LIMIT_TEST_TARGETS}
+      echo "Limiting tests to $TEST_TARGETS"
     fi
   fi
 }
@@ -387,7 +388,7 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has
     RAFT_CMAKE_CUDA_ARCHITECTURES="NATIVE"
     echo "Building for the architecture of the GPU in the system..."
   else
-    RAFT_CMAKE_CUDA_ARCHITECTURES="ALL"
+    RAFT_CMAKE_CUDA_ARCHITECTURES="RAPIDS"
     echo "Building for *ALL* supported GPU architectures..."
   fi
diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py
index bfef5392f5..43a4a186f8 100644
--- a/ci/checks/copyright.py
+++ b/ci/checks/copyright.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -37,7 +37,7 @@
     re.compile(r"setup[.]cfg$"),
     re.compile(r"meta[.]yaml$")
 ]
-ExemptFiles = ["cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh"]
+ExemptFiles = ["cpp/include/raft/spatial/knn/detail/faiss_select/"]
 
 # this will break starting at year 10000, which is probably OK :)
 CheckSimple = re.compile(
diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index 2f0e2b94ca..657126fdf0 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -43,7 +43,7 @@ export CMAKE_GENERATOR="Ninja"
 export CONDA_BLD_DIR="${WORKSPACE}/.conda-bld"
 
 # ucx-py version
-export UCX_PY_VERSION='0.30.*'
+export UCX_PY_VERSION='0.31.*'
 
 ################################################################################
 # SETUP - Check environment
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 1808480d37..84026203fa 100644
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -38,7 +38,7 @@ export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
 unset GIT_DESCRIBE_TAG
 
 # ucx-py version
-export UCX_PY_VERSION='0.30.*'
+export UCX_PY_VERSION='0.31.*'
 
 # Whether to install dask nightly or stable packages.
 export INSTALL_DASK_MAIN=1
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 0b6410f9c9..00f6905032 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 ########################
 # RAFT Version Updater #
 ########################
@@ -17,12 +17,14 @@
 CURRENT_MAJOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[1]}')
 CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}')
 CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}')
 CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR}
+CURRENT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${CURRENT_SHORT_TAG}).*"
 
 #Get <major>.<minor>
for next version NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} -NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*" +NEXT_UCX_PY_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})" +NEXT_UCX_PY_VERSION="${NEXT_UCX_PY_SHORT_TAG}.*" echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -53,3 +55,17 @@ done sed_runner "s/export UCX_PY_VERSION=.*/export UCX_PY_VERSION='${NEXT_UCX_PY_VERSION}'/g" ci/gpu/build.sh sed_runner "s/export UCX_PY_VERSION=.*/export UCX_PY_VERSION='${NEXT_UCX_PY_VERSION}'/g" ci/cpu/build.sh sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml + +# Wheel builds install dask-cuda from source, update its branch +sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" .github/workflows/*.yaml + +# Need to distutils-normalize the original version +NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") +NEXT_UCX_PY_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_UCX_PY_SHORT_TAG}'))") + +# Wheel builds install intra-RAPIDS dependencies from same release +sed_runner "s/{cuda_suffix}[^\"].*\",/{cuda_suffix}==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/pylibraft/setup.py +sed_runner "s/{cuda_suffix}.*\"\]/{cuda_suffix}==${NEXT_SHORT_TAG_PEP440}.*\"\]/g" python/pylibraft/_custom_build/backend.py +sed_runner "s/dask-cuda==.*\",/dask-cuda==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/raft-dask/setup.py +sed_runner "s/pylibraft{cuda_suffix}.*\",/pylibraft{cuda_suffix}==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/raft-dask/setup.py +sed_runner "s/ucx-py{cuda_suffix}.*\",/ucx-py{cuda_suffix}==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*\",/g" python/raft-dask/setup.py diff --git a/ci/wheel_smoke_test_pylibraft.py b/ci/wheel_smoke_test_pylibraft.py new file mode 100644 index 0000000000..7fee674691 --- /dev/null +++ b/ci/wheel_smoke_test_pylibraft.py @@ -0,0 +1,38 @@ +import numpy as np +from scipy.spatial.distance import cdist + +from pylibraft.common import Handle, Stream, device_ndarray +from pylibraft.distance import pairwise_distance + + +if __name__ == "__main__": + metric = "euclidean" + n_rows = 1337 + n_cols = 1337 + + input1 = np.random.random_sample((n_rows, n_cols)) + input1 = np.asarray(input1, order="C").astype(np.float64) + + output = np.zeros((n_rows, n_rows), dtype=np.float64) + + expected = cdist(input1, input1, metric) + + expected[expected <= 1e-5] = 0.0 + + input1_device = device_ndarray(input1) + output_device = None + + s2 = Stream() + handle = Handle(stream=s2) + ret_output = pairwise_distance( + input1_device, input1_device, output_device, metric, handle=handle + ) + handle.sync() + + output_device = ret_output + + actual = output_device.copy_to_host() + + actual[actual <= 1e-5] = 0.0 + + assert np.allclose(expected, actual, rtol=1e-4) diff --git a/ci/wheel_smoke_test_raft_dask.py b/ci/wheel_smoke_test_raft_dask.py new file mode 100644 index 0000000000..32c13e61ca --- /dev/null +++ b/ci/wheel_smoke_test_raft_dask.py @@ -0,0 +1,92 @@ +from dask.distributed import Client, wait +from dask_cuda import LocalCUDACluster, initialize + +from raft_dask.common import ( + Comms, + local_handle, + perform_test_comm_split, + perform_test_comms_allgather, + 
perform_test_comms_allreduce, + perform_test_comms_bcast, + perform_test_comms_device_multicast_sendrecv, + perform_test_comms_device_send_or_recv, + perform_test_comms_device_sendrecv, + perform_test_comms_gather, + perform_test_comms_gatherv, + perform_test_comms_reduce, + perform_test_comms_reducescatter, + perform_test_comms_send_recv, +) + +import os +os.environ["UCX_LOG_LEVEL"] = "error" + + +def func_test_send_recv(sessionId, n_trials): + handle = local_handle(sessionId) + return perform_test_comms_send_recv(handle, n_trials) + + +def func_test_collective(func, sessionId, root): + handle = local_handle(sessionId) + return func(handle, root) + + +if __name__ == "__main__": + # initial setup + cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0) + client = Client(cluster) + + n_trials = 5 + root_location = "client" + + # p2p test for ucx + cb = Comms(comms_p2p=True, verbose=True) + cb.init() + + dfs = [ + client.submit( + func_test_send_recv, + cb.sessionId, + n_trials, + pure=False, + workers=[w], + ) + for w in cb.worker_addresses + ] + + wait(dfs, timeout=5) + + assert list(map(lambda x: x.result(), dfs)) + + cb.destroy() + + # collectives test for nccl + + cb = Comms( + verbose=True, client=client, nccl_root_location=root_location + ) + cb.init() + + for k, v in cb.worker_info(cb.worker_addresses).items(): + + dfs = [ + client.submit( + func_test_collective, + perform_test_comms_allgather, + cb.sessionId, + v["rank"], + pure=False, + workers=[w], + ) + for w in cb.worker_addresses + ] + wait(dfs, timeout=5) + + assert all([x.result() for x in dfs]) + + cb.destroy() + + # final client and cluster teardown + client.close() + cluster.close() diff --git a/conda/environments/all_cuda-115_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml similarity index 56% rename from conda/environments/all_cuda-115_arch-x86_64.yaml rename to conda/environments/all_cuda-118_arch-x86_64.yaml index 18e0a8187f..f194b152a6 100644 --- a/conda/environments/all_cuda-115_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -12,37 +12,37 @@ dependencies: - clang-tools=11.1.0 - clang=11.1.0 - cmake>=3.23.1,!=3.25.0 -- cuda-profiler-api>=11.4.240,<=11.8.86 +- cuda-profiler-api=11.8.86 - cuda-python >=11.7.1,<12.0 -- cudatoolkit=11.5 +- cudatoolkit=11.8 - cupy - cxx-compiler - cython>=0.29,<0.30 -- dask-cuda=23.02.* +- dask-cuda=23.04 - dask>=2022.12.0 - distributed>=2022.12.0 - doxygen>=1.8.20 - faiss-proc=*=cuda -- gcc_linux-64=9.* -- libcublas-dev>=11.7.3.1,<=11.7.4.6 -- libcublas>=11.7.3.1,<=11.7.4.6 -- libcurand-dev>=10.2.6.48,<=10.2.7.107 -- libcurand>=10.2.6.48,<=10.2.7.107 -- libcusolver-dev>=11.2.1.48,<=11.3.2.107 -- libcusolver>=11.2.1.48,<=11.3.2.107 -- libcusparse-dev>=11.7.0.31,<=11.7.0.107 -- libcusparse>=11.7.0.31,<=11.7.0.107 -- libfaiss>=1.7.0=cuda* +- gcc_linux-64=9 +- libcublas-dev=11.11.3.6 +- libcublas=11.11.3.6 +- libcurand-dev=10.3.0.86 +- libcurand=10.3.0.86 +- libcusolver-dev=11.4.1.48 +- libcusolver=11.4.1.48 +- libcusparse-dev=11.7.5.86 +- libcusparse=11.7.5.86 +- libfaiss>=1.7.1=cuda* - ninja - pytest - pytest-cov -- rmm=23.02.* +- rmm=23.04 - scikit-build>=0.13.1 - scikit-learn - scipy - sphinx-markdown-tables - sysroot_linux-64==2.17 - ucx-proc=*=gpu -- ucx-py=0.30.* +- ucx-py=0.31.* - ucx>=1.13.0 -name: all_cuda-115_arch-x86_64 +name: all_cuda-118_arch-x86_64 diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index 399dd198eb..1012bddb40 100644 --- 
a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -20,42 +20,42 @@ gtest_version: - "=1.10.0" libfaiss_version: - - "1.7.0 *_cuda" + - "1.7.2 *_cuda" # The CTK libraries below are missing from the conda-forge::cudatoolkit -# package. The "*_host_*" version specifiers correspond to `11.5` packages and the +# package. The "*_host_*" version specifiers correspond to `11.8` packages and the # "*_run_*" version specifiers correspond to `11.x` packages. libcublas_host_version: - - ">=11.7.3.1,<=11.7.4.6" + - "=11.11.3.6" libcublas_run_version: - - ">=11.5.2.43,<=11.11.3.6" + - ">=11.5.2.43,<12.0.0" libcurand_host_version: - - ">=10.2.6.48,<=10.2.7.107" + - "=10.3.0.86" libcurand_run_version: - - ">=10.2.5.43,<=10.3.0.86" + - ">=10.2.5.43,<10.3.1" libcusolver_host_version: - - ">=11.2.1.48,<=11.3.2.107" + - "=11.4.1.48" libcusolver_run_version: - - ">=11.2.0.43,<=11.4.1.48" + - ">=11.2.0.43,<11.4.2" libcusparse_host_version: - - ">=11.7.0.31,<=11.7.0.107" + - "=11.7.5.86" libcusparse_run_version: - - ">=11.6.0.43,<=11.7.5.86" + - ">=11.6.0.43,<12.0.0" # `cuda-profiler-api` only has `11.8.0` and `12.0.0` packages for all # architectures. The "*_host_*" version specifiers correspond to `11.8` packages and the # "*_run_*" version specifiers correspond to `11.x` packages. cuda_profiler_api_host_version: - - ">=11.8.86,<12" + - "=11.8.86" cuda_profiler_api_run_version: - ">=11.4.240,<12" diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index 42d7e3a900..153fd2129e 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -14,7 +14,7 @@ ucx_version: - "1.13.0" ucx_py_version: - - "0.30.*" + - "0.31.*" cmake_version: - ">=3.23.1,!=3.25.0" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 784bbbb935..5a89c735bb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -10,8 +10,8 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. 
-set(RAPIDS_VERSION "23.02") -set(RAFT_VERSION "23.02.00") +set(RAPIDS_VERSION "23.04") +set(RAFT_VERSION "23.04.00") cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(../fetch_rapids.cmake) @@ -284,7 +284,18 @@ if(RAFT_COMPILE_DIST_LIBRARY) src/distance/cluster/update_centroids_double.cu src/distance/cluster/cluster_cost_float.cu src/distance/cluster/cluster_cost_double.cu - src/distance/neighbors/refine.cu + src/distance/neighbors/refine_d_uint64_t_float.cu + src/distance/neighbors/refine_d_uint64_t_int8_t.cu + src/distance/neighbors/refine_d_uint64_t_uint8_t.cu + src/distance/neighbors/refine_h_uint64_t_float.cu + src/distance/neighbors/refine_h_uint64_t_int8_t.cu + src/distance/neighbors/refine_h_uint64_t_uint8_t.cu + src/distance/neighbors/specializations/refine_d_uint64_t_float.cu + src/distance/neighbors/specializations/refine_d_uint64_t_int8_t.cu + src/distance/neighbors/specializations/refine_d_uint64_t_uint8_t.cu + src/distance/neighbors/specializations/refine_h_uint64_t_float.cu + src/distance/neighbors/specializations/refine_h_uint64_t_int8_t.cu + src/distance/neighbors/specializations/refine_h_uint64_t_uint8_t.cu src/distance/neighbors/ivfpq_search.cu src/distance/cluster/kmeans_fit_float.cu src/distance/cluster/kmeans_fit_double.cu @@ -665,6 +676,13 @@ raft_export( distance distributed nn DOCUMENTATION doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string ) +# ################################################################################################## +# * shared test/bench headers ------------------------------------------------ + +if(BUILD_TESTS OR BUILD_BENCH) + include(internal/CMakeLists.txt) +endif() + # ################################################################################################## # * build test executable ---------------------------------------------------- diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index 99606dd2e9..1bc2c86243 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -30,6 +30,7 @@ function(ConfigureBench) target_link_libraries( ${BENCH_NAME} PRIVATE raft::raft + raft_internal $<$:raft::distance> $<$:raft::nn> benchmark::benchmark @@ -81,6 +82,7 @@ if(BUILD_BENCH) bench/distance/distance_l1.cu bench/distance/distance_unexp_l2.cu bench/distance/fused_l2_nn.cu + bench/distance/masked_nn.cu bench/distance/kernels.cu bench/main.cpp OPTIONAL @@ -102,7 +104,10 @@ if(BUILD_BENCH) bench/main.cpp ) - ConfigureBench(NAME MATRIX_BENCH PATH bench/matrix/argmin.cu bench/main.cpp) + ConfigureBench( + NAME MATRIX_BENCH PATH bench/matrix/argmin.cu bench/matrix/gather.cu bench/matrix/select_k.cu + bench/main.cpp + ) ConfigureBench( NAME RANDOM_BENCH PATH bench/random/make_blobs.cu bench/random/permute.cu bench/random/rng.cu @@ -126,7 +131,6 @@ if(BUILD_BENCH) bench/neighbors/knn/ivf_pq_int8_t_int64_t.cu bench/neighbors/knn/ivf_pq_uint8_t_uint32_t.cu bench/neighbors/refine.cu - bench/neighbors/selection.cu bench/main.cpp OPTIONAL DIST diff --git a/cpp/bench/cluster/kmeans_balanced.cu b/cpp/bench/cluster/kmeans_balanced.cu index 210b40ced8..9c53e86d8c 100644 --- a/cpp/bench/cluster/kmeans_balanced.cu +++ b/cpp/bench/cluster/kmeans_balanced.cu @@ -15,20 +15,19 @@ */ #include +#include #include -#include -#if defined RAFT_DISTANCE_COMPILED && defined RAFT_NN_COMPILED -#include +#if defined RAFT_DISTANCE_COMPILED +#include #endif namespace raft::bench::cluster { struct KMeansBalancedBenchParams { DatasetParams data; - uint32_t max_iter; uint32_t n_lists; - raft::distance::DistanceType metric; + raft::cluster::kmeans_balanced_params kb_params; }; template @@ -38,15 +37,10 @@ struct KMeansBalanced : public fixture { void run_benchmark(::benchmark::State& state) override { this->loop_on_state(state, [this]() { - raft::spatial::knn::detail::kmeans::build_hierarchical(this->handle, - this->params.max_iter, - (uint32_t)this->params.data.cols, - this->X.data_handle(), - this->params.data.rows, - this->centroids.data_handle(), - this->params.n_lists, - this->params.metric, - this->handle.get_stream()); + raft::device_matrix_view X_view = this->X.view(); + raft::device_matrix_view centroids_view = this->centroids.view(); + raft::cluster::kmeans_balanced::fit( + this->handle, this->params.kb_params, X_view, centroids_view); }); } @@ -84,8 +78,8 @@ std::vector getKMeansBalancedInputs() std::vector out; KMeansBalancedBenchParams p; p.data.row_major = true; - p.max_iter = 20; - p.metric = raft::distance::DistanceType::L2Expanded; + p.kb_params.n_iters = 20; + p.kb_params.metric = raft::distance::DistanceType::L2Expanded; std::vector> row_cols = { {100000, 128}, {1000000, 128}, {10000000, 128}, // The following dataset sizes are too large for most GPUs. @@ -104,7 +98,5 @@ std::vector getKMeansBalancedInputs() // Note: the datasets sizes are too large for 32-bit index types. RAFT_BENCH_REGISTER((KMeansBalanced), "", getKMeansBalancedInputs()); -RAFT_BENCH_REGISTER((KMeansBalanced), "", getKMeansBalancedInputs()); -RAFT_BENCH_REGISTER((KMeansBalanced), "", getKMeansBalancedInputs()); } // namespace raft::bench::cluster diff --git a/cpp/bench/common/benchmark.hpp b/cpp/bench/common/benchmark.hpp index 13ca40a033..85d5381e2c 100644 --- a/cpp/bench/common/benchmark.hpp +++ b/cpp/bench/common/benchmark.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include @@ -110,7 +110,7 @@ class fixture { rmm::device_buffer scratch_buf_; public: - raft::handle_t handle; + raft::device_resources handle; rmm::cuda_stream_view stream; fixture() : stream{handle.get_stream()} diff --git a/cpp/bench/distance/distance_common.cuh b/cpp/bench/distance/distance_common.cuh index 73faacce37..1be00ec0c7 100644 --- a/cpp/bench/distance/distance_common.cuh +++ b/cpp/bench/distance/distance_common.cuh @@ -24,14 +24,14 @@ namespace raft::bench::distance { -struct distance_inputs { +struct distance_params { int m, n, k; bool isRowMajor; -}; // struct distance_inputs +}; // struct distance_params template struct distance : public fixture { - distance(const distance_inputs& p) + distance(const distance_params& p) : params(p), x(p.m * p.k, stream), y(p.n * p.k, stream), @@ -63,13 +63,13 @@ struct distance : public fixture { } private: - distance_inputs params; + distance_params params; rmm::device_uvector x, y, out; rmm::device_uvector workspace; size_t worksize; }; // struct Distance -const std::vector dist_input_vecs{ +const std::vector dist_input_vecs{ {32, 16384, 16384, true}, {64, 16384, 16384, true}, {128, 16384, 16384, true}, {256, 16384, 16384, true}, {512, 16384, 16384, true}, {1024, 16384, 16384, true}, {16384, 32, 16384, true}, {16384, 64, 16384, true}, {16384, 128, 16384, true}, diff --git a/cpp/bench/distance/kernels.cu b/cpp/bench/distance/kernels.cu index 5c9c2cc2ed..027f93171e 100644 --- a/cpp/bench/distance/kernels.cu +++ b/cpp/bench/distance/kernels.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include @@ -77,7 +77,7 @@ struct GramMatrix : public fixture { } private: - const raft::handle_t handle; + const raft::device_resources handle; std::unique_ptr> kernel; GramTestParams params; diff --git a/cpp/bench/distance/masked_nn.cu b/cpp/bench/distance/masked_nn.cu new file mode 100644 index 0000000000..3677d44864 --- /dev/null +++ b/cpp/bench/distance/masked_nn.cu @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined RAFT_NN_COMPILED +#include +#endif + +namespace raft::bench::distance::masked_nn { + +// Introduce various sparsity patterns +enum AdjacencyPattern { + checkerboard = 0, + checkerboard_4 = 1, + checkerboard_64 = 2, + all_true = 3, + all_false = 4 +}; + +struct Params { + int m, n, k, num_groups; + AdjacencyPattern pattern; +}; // struct Params + +__global__ void init_adj(AdjacencyPattern pattern, + int n, + raft::device_matrix_view adj, + raft::device_vector_view group_idxs) +{ + int m = adj.extent(0); + int num_groups = adj.extent(1); + + for (int idx_m = blockIdx.y * blockDim.y + threadIdx.y; idx_m < m; + idx_m += blockDim.y * gridDim.y) { + for (int idx_g = blockIdx.x * blockDim.x + threadIdx.x; idx_g < num_groups; + idx_g += blockDim.x * gridDim.x) { + switch (pattern) { + case checkerboard: adj(idx_m, idx_g) = (idx_m + idx_g) % 2; break; + case checkerboard_4: adj(idx_m, idx_g) = (idx_m / 4 + idx_g) % 2; break; + case checkerboard_64: adj(idx_m, idx_g) = (idx_m / 64 + idx_g) % 2; break; + case all_true: adj(idx_m, idx_g) = true; break; + case all_false: adj(idx_m, idx_g) = false; break; + default: assert(false && "unknown pattern"); + } + } + } + // Each group is of size n / num_groups. + // + // - group_idxs[j] indicates the start of group j + 1 (i.e. is the inclusive + // scan of the group lengths) + // + // - The first group always starts at index zero, so we do not store it. + // + // - The group_idxs[num_groups - 1] should always equal n. + + if (blockIdx.y == 0 && threadIdx.y == 0) { + const int g_stride = blockDim.x * gridDim.x; + for (int idx_g = blockIdx.x * blockDim.x + threadIdx.x; idx_g < num_groups; idx_g += g_stride) { + group_idxs(idx_g) = (idx_g + 1) * (n / num_groups); + } + group_idxs(num_groups - 1) = n; + } +} + +template +struct masked_l2_nn : public fixture { + using DataT = T; + using IdxT = int; + using OutT = raft::KeyValuePair; + using RedOpT = raft::distance::MinAndDistanceReduceOp; + using PairRedOpT = raft::distance::KVPMinReduce; + using ParamT = raft::distance::MaskedL2NNParams; + + // Parameters + Params params; + // Data + raft::device_vector out; + raft::device_matrix x, y; + raft::device_vector xn, yn; + raft::device_matrix adj; + raft::device_vector group_idxs; + + masked_l2_nn(const Params& p) + : params(p), + out{raft::make_device_vector(handle, p.m)}, + x{raft::make_device_matrix(handle, p.m, p.k)}, + y{raft::make_device_matrix(handle, p.n, p.k)}, + xn{raft::make_device_vector(handle, p.m)}, + yn{raft::make_device_vector(handle, p.n)}, + adj{raft::make_device_matrix(handle, p.m, p.num_groups)}, + group_idxs{raft::make_device_vector(handle, p.num_groups)} + { + raft::random::RngState r(123456ULL); + + uniform(handle, r, x.data_handle(), p.m * p.k, T(-1.0), T(1.0)); + uniform(handle, r, y.data_handle(), p.n * p.k, T(-1.0), T(1.0)); + raft::linalg::rowNorm( + xn.data_handle(), x.data_handle(), p.k, p.m, raft::linalg::L2Norm, true, stream); + raft::linalg::rowNorm( + yn.data_handle(), y.data_handle(), p.k, p.n, raft::linalg::L2Norm, true, stream); + raft::distance::initialize, int>( + handle, out.data_handle(), p.m, std::numeric_limits::max(), RedOpT{}); + + dim3 block(32, 32); + dim3 grid(10, 10); + init_adj<<>>(p.pattern, p.n, adj.view(), group_idxs.view()); + RAFT_CUDA_TRY(cudaGetLastError()); + } + + void run_benchmark(::benchmark::State& state) override + { + bool init_out = true; + 
bool sqrt = false; + ParamT masked_l2_params{RedOpT{}, PairRedOpT{}, sqrt, init_out}; + + loop_on_state(state, [this, masked_l2_params]() { + // It is sufficient to only benchmark the L2-squared metric + raft::distance::maskedL2NN(handle, + masked_l2_params, + x.view(), + y.view(), + xn.view(), + yn.view(), + adj.view(), + group_idxs.view(), + out.view()); + }); + + // Virtual flop count if no skipping had occurred. + size_t virtual_flops = size_t(2) * size_t(params.m) * size_t(params.n) * size_t(params.k); + + int64_t read_elts = params.n * params.k + params.m * params.k; + int64_t write_elts = params.m; + + // Virtual min flops is the number of flops that would have been executed if + // the algorithm had actually skipped each computation that it could have + // skipped. + size_t virtual_min_flops = 0; + switch (params.pattern) { + case checkerboard: + case checkerboard_4: + case checkerboard_64: virtual_min_flops = virtual_flops / 2; break; + case all_true: virtual_min_flops = virtual_flops; break; + case all_false: virtual_min_flops = 0; break; + default: assert(false && "unknown pattern"); + } + + // VFLOP/s is the "virtual" flop count that would have executed if there was + // no adjacency pattern. This is useful for comparing to fusedL2NN + state.counters["VFLOP/s"] = benchmark::Counter(virtual_flops, + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + // Virtual min flops is the number of flops that would have been executed if + // the algorithm had actually skipped each computation that it could have + // skipped. + state.counters["VminFLOP/s"] = benchmark::Counter(virtual_min_flops, + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + + state.counters["BW Wr"] = benchmark::Counter(write_elts * sizeof(OutT), + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + state.counters["BW Rd"] = benchmark::Counter(read_elts * sizeof(DataT), + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + + state.counters["m"] = benchmark::Counter(params.m); + state.counters["n"] = benchmark::Counter(params.n); + state.counters["k"] = benchmark::Counter(params.k); + state.counters["num_groups"] = benchmark::Counter(params.num_groups); + state.counters["group size"] = benchmark::Counter(params.n / params.num_groups); + state.counters["Pat"] = benchmark::Counter(static_cast(params.pattern)); + + state.counters["SM count"] = raft::getMultiProcessorCount(); + } +}; // struct MaskedL2NN + +const std::vector masked_l2_nn_input_vecs = { + // Very fat matrices... + {32, 16384, 16384, 32, AdjacencyPattern::checkerboard}, + {64, 16384, 16384, 32, AdjacencyPattern::checkerboard}, + {128, 16384, 16384, 32, AdjacencyPattern::checkerboard}, + {256, 16384, 16384, 32, AdjacencyPattern::checkerboard}, + {512, 16384, 16384, 32, AdjacencyPattern::checkerboard}, + {1024, 16384, 16384, 32, AdjacencyPattern::checkerboard}, + {16384, 32, 16384, 32, AdjacencyPattern::checkerboard}, + {16384, 64, 16384, 32, AdjacencyPattern::checkerboard}, + {16384, 128, 16384, 32, AdjacencyPattern::checkerboard}, + {16384, 256, 16384, 32, AdjacencyPattern::checkerboard}, + {16384, 512, 16384, 32, AdjacencyPattern::checkerboard}, + {16384, 1024, 16384, 32, AdjacencyPattern::checkerboard}, + + // Representative matrices... 
+ {16384, 16384, 32, 32, AdjacencyPattern::checkerboard}, + {16384, 16384, 64, 32, AdjacencyPattern::checkerboard}, + {16384, 16384, 128, 32, AdjacencyPattern::checkerboard}, + {16384, 16384, 256, 32, AdjacencyPattern::checkerboard}, + {16384, 16384, 512, 32, AdjacencyPattern::checkerboard}, + {16384, 16384, 1024, 32, AdjacencyPattern::checkerboard}, + {16384, 16384, 16384, 32, AdjacencyPattern::checkerboard}, + + {16384, 16384, 32, 32, AdjacencyPattern::checkerboard_4}, + {16384, 16384, 64, 32, AdjacencyPattern::checkerboard_4}, + {16384, 16384, 128, 32, AdjacencyPattern::checkerboard_4}, + {16384, 16384, 256, 32, AdjacencyPattern::checkerboard_4}, + {16384, 16384, 512, 32, AdjacencyPattern::checkerboard_4}, + {16384, 16384, 1024, 32, AdjacencyPattern::checkerboard_4}, + {16384, 16384, 16384, 32, AdjacencyPattern::checkerboard_4}, + + {16384, 16384, 32, 32, AdjacencyPattern::checkerboard_64}, + {16384, 16384, 64, 32, AdjacencyPattern::checkerboard_64}, + {16384, 16384, 128, 32, AdjacencyPattern::checkerboard_64}, + {16384, 16384, 256, 32, AdjacencyPattern::checkerboard_64}, + {16384, 16384, 512, 32, AdjacencyPattern::checkerboard_64}, + {16384, 16384, 1024, 32, AdjacencyPattern::checkerboard_64}, + {16384, 16384, 16384, 32, AdjacencyPattern::checkerboard_64}, + + {16384, 16384, 32, 32, AdjacencyPattern::all_true}, + {16384, 16384, 64, 32, AdjacencyPattern::all_true}, + {16384, 16384, 128, 32, AdjacencyPattern::all_true}, + {16384, 16384, 256, 32, AdjacencyPattern::all_true}, + {16384, 16384, 512, 32, AdjacencyPattern::all_true}, + {16384, 16384, 1024, 32, AdjacencyPattern::all_true}, + {16384, 16384, 16384, 32, AdjacencyPattern::all_true}, + + {16384, 16384, 32, 32, AdjacencyPattern::all_false}, + {16384, 16384, 64, 32, AdjacencyPattern::all_false}, + {16384, 16384, 128, 32, AdjacencyPattern::all_false}, + {16384, 16384, 256, 32, AdjacencyPattern::all_false}, + {16384, 16384, 512, 32, AdjacencyPattern::all_false}, + {16384, 16384, 1024, 32, AdjacencyPattern::all_false}, + {16384, 16384, 16384, 32, AdjacencyPattern::all_false}, +}; + +RAFT_BENCH_REGISTER(masked_l2_nn, "", masked_l2_nn_input_vecs); +// We don't benchmark double to keep compile times in check when not using the +// distance library. + +} // namespace raft::bench::distance::masked_nn diff --git a/cpp/bench/matrix/argmin.cu b/cpp/bench/matrix/argmin.cu index 0d0dea0fdb..3869f0c5e1 100644 --- a/cpp/bench/matrix/argmin.cu +++ b/cpp/bench/matrix/argmin.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,10 +17,11 @@ #include #include #include +#include #include -namespace raft::bench::linalg { +namespace raft::bench::matrix { template struct ArgminParams { @@ -45,9 +46,7 @@ struct Argmin : public fixture { void run_benchmark(::benchmark::State& state) override { loop_on_state(state, [this]() { - auto matrix_const_view = raft::make_device_matrix_view( - matrix.data_handle(), matrix.extent(0), matrix.extent(1)); - raft::matrix::argmin(handle, matrix_const_view, indices.view()); + raft::matrix::argmin(handle, raft::make_const_mdspan(matrix.view()), indices.view()); }); } @@ -57,15 +56,11 @@ struct Argmin : public fixture { raft::device_vector indices; }; // struct Argmin -const std::vector> argmin_inputs_i64{ - {1000, 64}, {1000, 128}, {1000, 256}, {1000, 512}, {1000, 1024}, - {10000, 64}, {10000, 128}, {10000, 256}, {10000, 512}, {10000, 1024}, - {100000, 64}, {100000, 128}, {100000, 256}, {100000, 512}, {100000, 1024}, - {1000000, 64}, {1000000, 128}, {1000000, 256}, {1000000, 512}, {1000000, 1024}, - {10000000, 64}, {10000000, 128}, {10000000, 256}, {10000000, 512}, {10000000, 1024}, -}; +const std::vector> argmin_inputs_i64 = + raft::util::itertools::product>({1000, 10000, 100000, 1000000, 10000000}, + {64, 128, 256, 512, 1024}); RAFT_BENCH_REGISTER((Argmin), "", argmin_inputs_i64); RAFT_BENCH_REGISTER((Argmin), "", argmin_inputs_i64); -} // namespace raft::bench::linalg +} // namespace raft::bench::matrix diff --git a/cpp/bench/matrix/gather.cu b/cpp/bench/matrix/gather.cu new file mode 100644 index 0000000000..c5d80744cd --- /dev/null +++ b/cpp/bench/matrix/gather.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +namespace raft::bench::matrix { + +template +struct GatherParams { + IdxT rows, cols, map_length; +}; + +template +inline auto operator<<(std::ostream& os, const GatherParams& p) -> std::ostream& +{ + os << p.rows << "#" << p.cols << "#" << p.map_length; + return os; +} + +template +struct Gather : public fixture { + Gather(const GatherParams& p) : params(p) {} + + void allocate_data(const ::benchmark::State& state) override + { + matrix = raft::make_device_matrix(handle, params.rows, params.cols); + map = raft::make_device_vector(handle, params.map_length); + out = raft::make_device_matrix(handle, params.map_length, params.cols); + stencil = raft::make_device_vector(handle, Conditional ? 
params.map_length : IdxT(0)); + + raft::random::RngState rng{1234}; + raft::random::uniform( + rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1), stream); + raft::random::uniformInt( + handle, rng, map.data_handle(), params.map_length, (MapT)0, (MapT)params.rows); + if constexpr (Conditional) { + raft::random::uniform(rng, stencil.data_handle(), params.map_length, T(-1), T(1), stream); + } + handle.sync_stream(stream); + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + loop_on_state(state, [this]() { + auto matrix_const_view = raft::make_const_mdspan(matrix.view()); + auto map_const_view = raft::make_const_mdspan(map.view()); + if constexpr (Conditional) { + auto stencil_const_view = raft::make_const_mdspan(stencil.view()); + auto pred_op = raft::plug_const_op(T(0.0), raft::greater_op()); + raft::matrix::gather_if( + handle, matrix_const_view, out.view(), map_const_view, stencil_const_view, pred_op); + } else { + raft::matrix::gather(handle, matrix_const_view, map_const_view, out.view()); + } + }); + } + + private: + GatherParams params; + raft::device_matrix matrix, out; + raft::device_vector stencil; + raft::device_vector map; +}; // struct Gather + +template +using GatherIf = Gather; + +const std::vector> gather_inputs_i64 = + raft::util::itertools::product>( + {1000000}, {10, 20, 50, 100, 200, 500}, {1000, 10000, 100000, 1000000}); + +RAFT_BENCH_REGISTER((Gather), "", gather_inputs_i64); +RAFT_BENCH_REGISTER((Gather), "", gather_inputs_i64); +RAFT_BENCH_REGISTER((GatherIf), "", gather_inputs_i64); +RAFT_BENCH_REGISTER((GatherIf), "", gather_inputs_i64); +} // namespace raft::bench::matrix diff --git a/cpp/bench/matrix/select_k.cu b/cpp/bench/matrix/select_k.cu new file mode 100644 index 0000000000..2c8b8bb67b --- /dev/null +++ b/cpp/bench/matrix/select_k.cu @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace raft::matrix { + +using namespace raft::bench; // NOLINT + +template +struct selection : public fixture { + explicit selection(const select::params& p) + : params_(p), + in_dists_(p.batch_size * p.len, stream), + in_ids_(p.batch_size * p.len, stream), + out_dists_(p.batch_size * p.k, stream), + out_ids_(p.batch_size * p.k, stream) + { + raft::sparse::iota_fill(in_ids_.data(), IdxT(p.batch_size), IdxT(p.len), stream); + raft::random::RngState state{42}; + raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), KeyT(-1.0), KeyT(1.0)); + } + + void run_benchmark(::benchmark::State& state) override // NOLINT + { + device_resources handle{stream}; + using_pool_memory_res res; + try { + std::ostringstream label_stream; + label_stream << params_.batch_size << "#" << params_.len << "#" << params_.k; + state.SetLabel(label_stream.str()); + loop_on_state(state, [this, &handle]() { + select::select_k_impl(handle, + Algo, + in_dists_.data(), + in_ids_.data(), + params_.batch_size, + params_.len, + params_.k, + out_dists_.data(), + out_ids_.data(), + params_.select_min); + }); + } catch (raft::exception& e) { + state.SkipWithError(e.what()); + } + } + + private: + const select::params params_; + rmm::device_uvector in_dists_, out_dists_; + rmm::device_uvector in_ids_, out_ids_; +}; + +const std::vector kInputs{ + {20000, 500, 1, true}, {20000, 500, 2, true}, {20000, 500, 4, true}, + {20000, 500, 8, true}, {20000, 500, 16, true}, {20000, 500, 32, true}, + {20000, 500, 64, true}, {20000, 500, 128, true}, {20000, 500, 256, true}, + + {1000, 10000, 1, true}, {1000, 10000, 2, true}, {1000, 10000, 4, true}, + {1000, 10000, 8, true}, {1000, 10000, 16, true}, {1000, 10000, 32, true}, + {1000, 10000, 64, true}, {1000, 10000, 128, true}, {1000, 10000, 256, true}, + + {100, 100000, 1, true}, {100, 100000, 2, true}, {100, 100000, 4, true}, + {100, 100000, 8, true}, {100, 100000, 16, true}, {100, 100000, 32, true}, + {100, 100000, 64, true}, {100, 100000, 128, true}, {100, 100000, 256, true}, + + {10, 1000000, 1, true}, {10, 1000000, 2, true}, {10, 1000000, 4, true}, + {10, 1000000, 8, true}, {10, 1000000, 16, true}, {10, 1000000, 32, true}, + {10, 1000000, 64, true}, {10, 1000000, 128, true}, {10, 1000000, 256, true}, +}; + +#define SELECTION_REGISTER(KeyT, IdxT, A) \ + namespace BENCHMARK_PRIVATE_NAME(selection) \ + { \ + using SelectK = selection; \ + RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #A, kInputs); \ + } + +SELECTION_REGISTER(float, int, kPublicApi); // NOLINT +SELECTION_REGISTER(float, int, kRadix8bits); // NOLINT +SELECTION_REGISTER(float, int, kRadix11bits); // NOLINT +SELECTION_REGISTER(float, int, kWarpAuto); // NOLINT +SELECTION_REGISTER(float, int, kWarpImmediate); // NOLINT +SELECTION_REGISTER(float, int, kWarpFiltered); // NOLINT +SELECTION_REGISTER(float, int, kWarpDistributed); // NOLINT +SELECTION_REGISTER(float, int, kWarpDistributedShm); // NOLINT + +SELECTION_REGISTER(double, int, kRadix8bits); // NOLINT +SELECTION_REGISTER(double, int, kRadix11bits); // NOLINT +SELECTION_REGISTER(double, int, kWarpAuto); // NOLINT + +SELECTION_REGISTER(double, size_t, kRadix8bits); // NOLINT +SELECTION_REGISTER(double, size_t, kRadix11bits); // NOLINT +SELECTION_REGISTER(double, size_t, kWarpImmediate); // NOLINT +SELECTION_REGISTER(double, size_t, kWarpFiltered); // NOLINT +SELECTION_REGISTER(double, size_t, kWarpDistributed); // 
NOLINT +SELECTION_REGISTER(double, size_t, kWarpDistributedShm); // NOLINT + +} // namespace raft::matrix diff --git a/cpp/bench/neighbors/knn.cuh b/cpp/bench/neighbors/knn.cuh index d38631b289..eec1cba99e 100644 --- a/cpp/bench/neighbors/knn.cuh +++ b/cpp/bench/neighbors/knn.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,6 +32,9 @@ #include #if defined RAFT_DISTANCE_COMPILED #include +#include +#else +#pragma message("NN / Distance specializations are not enabled; expect very long building times.") #endif #endif @@ -148,7 +151,7 @@ struct ivf_flat_knn { raft::neighbors::ivf_flat::search_params search_params; params ps; - ivf_flat_knn(const raft::handle_t& handle, const params& ps, const ValT* data) : ps(ps) + ivf_flat_knn(const raft::device_resources& handle, const params& ps, const ValT* data) : ps(ps) { index_params.n_lists = 4096; index_params.metric = raft::distance::DistanceType::L2Expanded; @@ -156,7 +159,7 @@ struct ivf_flat_knn { handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims))); } - void search(const raft::handle_t& handle, + void search(const raft::device_resources& handle, const ValT* search_items, dist_t* out_dists, IdxT* out_idxs) @@ -176,7 +179,7 @@ struct ivf_pq_knn { raft::neighbors::ivf_pq::search_params search_params; params ps; - ivf_pq_knn(const raft::handle_t& handle, const params& ps, const ValT* data) : ps(ps) + ivf_pq_knn(const raft::device_resources& handle, const params& ps, const ValT* data) : ps(ps) { index_params.n_lists = 4096; index_params.metric = raft::distance::DistanceType::L2Expanded; @@ -184,7 +187,7 @@ struct ivf_pq_knn { handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims))); } - void search(const raft::handle_t& handle, + void search(const raft::device_resources& handle, const ValT* search_items, dist_t* out_dists, IdxT* out_idxs) @@ -202,12 +205,12 @@ struct brute_force_knn { ValT* index; params ps; - brute_force_knn(const raft::handle_t& handle, const params& ps, const ValT* data) + brute_force_knn(const raft::device_resources& handle, const params& ps, const ValT* data) : index(const_cast(data)), ps(ps) { } - void search(const raft::handle_t& handle, + void search(const raft::device_resources& handle, const ValT* search_items, dist_t* out_dists, IdxT* out_idxs) @@ -287,7 +290,7 @@ struct knn : public fixture { std::ostringstream label_stream; label_stream << params_ << "#" << strategy_ << "#" << scope_; state.SetLabel(label_stream.str()); - raft::handle_t handle(stream); + raft::device_resources handle(stream); std::optional index; if (scope_ == Scope::SEARCH) { // also implies TransferStrategy::NO_COPY diff --git a/cpp/bench/neighbors/refine.cu b/cpp/bench/neighbors/refine.cu index a038905ace..f32af3a57e 100644 --- a/cpp/bench/neighbors/refine.cu +++ b/cpp/bench/neighbors/refine.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,20 @@ * limitations under the License. 
*/ -#include +#include -#include +#include #include -#include +#include #include #include #include +#include #if defined RAFT_DISTANCE_COMPILED #include +#include #endif #if defined RAFT_NN_COMPILED @@ -36,12 +38,10 @@ #include #include -#include "../../test/neighbors/refine_helper.cuh" - #include #include -using namespace raft::neighbors::detail; +using namespace raft::neighbors; namespace raft::bench::neighbors { @@ -53,7 +53,7 @@ inline auto operator<<(std::ostream& os, const RefineInputs& p) -> std::os return os; } -RefineInputs p; +RefineInputs p; template class RefineAnn : public fixture { @@ -95,28 +95,28 @@ class RefineAnn : public fixture { } private: - raft::handle_t handle_; + raft::device_resources handle_; RefineHelper data; }; -std::vector> getInputs() +std::vector> getInputs() { - std::vector> out; + std::vector> out; raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded; for (bool host_data : {true, false}) { - for (int64_t n_queries : {1000, 10000}) { - for (int64_t dim : {128, 512}) { - out.push_back(RefineInputs{n_queries, 2000000, dim, 32, 128, metric, host_data}); - out.push_back(RefineInputs{n_queries, 2000000, dim, 10, 40, metric, host_data}); + for (uint64_t n_queries : {1000, 10000}) { + for (uint64_t dim : {128, 512}) { + out.push_back(RefineInputs{n_queries, 2000000, dim, 32, 128, metric, host_data}); + out.push_back(RefineInputs{n_queries, 2000000, dim, 10, 40, metric, host_data}); } } } return out; } -using refine_float_int64 = RefineAnn; -RAFT_BENCH_REGISTER(refine_float_int64, "", getInputs()); +using refine_float_uint64 = RefineAnn; +RAFT_BENCH_REGISTER(refine_float_uint64, "", getInputs()); -using refine_uint8_int64 = RefineAnn; -RAFT_BENCH_REGISTER(refine_uint8_int64, "", getInputs()); +using refine_uint8_uint64 = RefineAnn; +RAFT_BENCH_REGISTER(refine_uint8_uint64, "", getInputs()); } // namespace raft::bench::neighbors diff --git a/cpp/bench/neighbors/selection.cu b/cpp/bench/neighbors/selection.cu deleted file mode 100644 index 1f116c199f..0000000000 --- a/cpp/bench/neighbors/selection.cu +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#if defined RAFT_NN_COMPILED -#include -#endif - -#include -#include - -#include -#include - -namespace raft::bench::spatial { - -struct params { - int n_inputs; - int input_len; - int k; - int select_min; -}; - -template -struct selection : public fixture { - explicit selection(const params& p) - : params_(p), - in_dists_(p.n_inputs * p.input_len, stream), - in_ids_(p.n_inputs * p.input_len, stream), - out_dists_(p.n_inputs * p.k, stream), - out_ids_(p.n_inputs * p.k, stream) - { - raft::sparse::iota_fill(in_ids_.data(), IdxT(p.n_inputs), IdxT(p.input_len), stream); - raft::random::RngState state{42}; - raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), KeyT(-1.0), KeyT(1.0)); - } - - void run_benchmark(::benchmark::State& state) override - { - using_pool_memory_res res; - try { - std::ostringstream label_stream; - label_stream << params_.n_inputs << "#" << params_.input_len << "#" << params_.k; - state.SetLabel(label_stream.str()); - loop_on_state(state, [this]() { - raft::spatial::knn::select_k(in_dists_.data(), - in_ids_.data(), - params_.n_inputs, - params_.input_len, - out_dists_.data(), - out_ids_.data(), - params_.select_min, - params_.k, - stream, - Algo); - }); - } catch (raft::exception& e) { - state.SkipWithError(e.what()); - } - } - - private: - const params params_; - rmm::device_uvector in_dists_, out_dists_; - rmm::device_uvector in_ids_, out_ids_; -}; - -const std::vector kInputs{ - {20000, 500, 1, true}, {20000, 500, 2, true}, {20000, 500, 4, true}, - {20000, 500, 8, true}, {20000, 500, 16, true}, {20000, 500, 32, true}, - {20000, 500, 64, true}, {20000, 500, 128, true}, {20000, 500, 256, true}, - - {1000, 10000, 1, true}, {1000, 10000, 2, true}, {1000, 10000, 4, true}, - {1000, 10000, 8, true}, {1000, 10000, 16, true}, {1000, 10000, 32, true}, - {1000, 10000, 64, true}, {1000, 10000, 128, true}, {1000, 10000, 256, true}, - - {100, 100000, 1, true}, {100, 100000, 2, true}, {100, 100000, 4, true}, - {100, 100000, 8, true}, {100, 100000, 16, true}, {100, 100000, 32, true}, - {100, 100000, 64, true}, {100, 100000, 128, true}, {100, 100000, 256, true}, - - {10, 1000000, 1, true}, {10, 1000000, 2, true}, {10, 1000000, 4, true}, - {10, 1000000, 8, true}, {10, 1000000, 16, true}, {10, 1000000, 32, true}, - {10, 1000000, 64, true}, {10, 1000000, 128, true}, {10, 1000000, 256, true}, -}; - -#define SELECTION_REGISTER(KeyT, IdxT, Algo) \ - namespace BENCHMARK_PRIVATE_NAME(selection) \ - { \ - using SelectK = selection; \ - RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #Algo, kInputs); \ - } - -SELECTION_REGISTER(float, int, FAISS); -SELECTION_REGISTER(float, int, RADIX_8_BITS); -SELECTION_REGISTER(float, int, RADIX_11_BITS); -SELECTION_REGISTER(float, int, WARP_SORT); - -SELECTION_REGISTER(double, int, FAISS); -SELECTION_REGISTER(double, int, RADIX_8_BITS); -SELECTION_REGISTER(double, int, RADIX_11_BITS); -SELECTION_REGISTER(double, int, WARP_SORT); - -SELECTION_REGISTER(double, size_t, FAISS); -SELECTION_REGISTER(double, size_t, RADIX_8_BITS); -SELECTION_REGISTER(double, size_t, RADIX_11_BITS); -SELECTION_REGISTER(double, size_t, WARP_SORT); - -} // namespace raft::bench::spatial diff --git a/cpp/bench/random/permute.cu b/cpp/bench/random/permute.cu index 5364bb44e3..cb9e21868b 100644 --- a/cpp/bench/random/permute.cu +++ b/cpp/bench/random/permute.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -50,7 +50,7 @@ struct permute : public fixture { } private: - raft::handle_t handle; + raft::device_resources handle; permute_inputs params; rmm::device_uvector out, in; rmm::device_uvector perms; diff --git a/cpp/bench/sparse/convert_csr.cu b/cpp/bench/sparse/convert_csr.cu index 830fab13cc..c9dcae6985 100644 --- a/cpp/bench/sparse/convert_csr.cu +++ b/cpp/bench/sparse/convert_csr.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -107,7 +107,7 @@ struct bench_base : public fixture { } protected: - raft::handle_t handle; + raft::device_resources handle; bench_param params; rmm::device_uvector adj; rmm::device_uvector row_ind; diff --git a/cpp/cmake/thirdparty/get_cutlass.cmake b/cpp/cmake/thirdparty/get_cutlass.cmake index 811a5466c3..3e02ce064e 100644 --- a/cpp/cmake/thirdparty/get_cutlass.cmake +++ b/cpp/cmake/thirdparty/get_cutlass.cmake @@ -30,6 +30,10 @@ function(find_and_configure_cutlass) CACHE BOOL "Disable CUTLASS to build with cuBLAS library." ) + if (CUDA_STATIC_RUNTIME) + set(CUDART_LIBRARY "${CUDA_cudart_static_LIBRARY}" CACHE FILEPATH "fixing cutlass cmake code" FORCE) + endif() + rapids_cpm_find( NvidiaCutlass ${PKG_VERSION} GLOBAL_TARGETS nvidia::cutlass::cutlass diff --git a/cpp/include/raft/cluster/detail/agglomerative.cuh b/cpp/include/raft/cluster/detail/agglomerative.cuh index 618f852bba..f4b2ecf051 100644 --- a/cpp/include/raft/cluster/detail/agglomerative.cuh +++ b/cpp/include/raft/cluster/detail/agglomerative.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include @@ -100,7 +100,7 @@ class UnionFind { * @param[out] out_size cluster sizes of output */ template -void build_dendrogram_host(const handle_t& handle, +void build_dendrogram_host(raft::device_resources const& handle, const value_idx* rows, const value_idx* cols, const value_t* data, @@ -236,7 +236,7 @@ struct init_label_roots { * @param n_leaves */ template -void extract_flattened_clusters(const raft::handle_t& handle, +void extract_flattened_clusters(raft::device_resources const& handle, value_idx* labels, const value_idx* children, size_t n_clusters, diff --git a/cpp/include/raft/cluster/detail/connectivities.cuh b/cpp/include/raft/cluster/detail/connectivities.cuh index a07045f0d2..163670f29a 100644 --- a/cpp/include/raft/cluster/detail/connectivities.cuh +++ b/cpp/include/raft/cluster/detail/connectivities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -39,7 +40,7 @@ namespace raft::cluster::detail { template struct distance_graph_impl { - void run(const raft::handle_t& handle, + void run(raft::device_resources const& handle, const value_t* X, size_t m, size_t n, @@ -57,7 +58,7 @@ struct distance_graph_impl { */ template struct distance_graph_impl { - void run(const raft::handle_t& handle, + void run(raft::device_resources const& handle, const value_t* X, size_t m, size_t n, @@ -103,6 +104,98 @@ struct distance_graph_impl +__global__ void fill_indices2(value_idx* indices, size_t m, size_t nnz) +{ + value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid >= nnz) return; + value_idx v = tid % m; + indices[tid] = v; +} + +/** + * Compute connected CSR of pairwise distances + * @tparam value_idx + * @tparam value_t + * @param handle + * @param X + * @param m + * @param n + * @param metric + * @param[out] indptr + * @param[out] indices + * @param[out] data + */ +template +void pairwise_distances(const raft::device_resources& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + value_idx* indptr, + value_idx* indices, + value_t* data) +{ + auto stream = handle.get_stream(); + auto exec_policy = handle.get_thrust_policy(); + + value_idx nnz = m * m; + + value_idx blocks = raft::ceildiv(nnz, (value_idx)256); + fill_indices2<<>>(indices, m, nnz); + + thrust::sequence(exec_policy, indptr, indptr + m, 0, (int)m); + + raft::update_device(indptr + m, &nnz, 1, stream); + + // TODO: It would ultimately be nice if the MST could accept + // dense inputs directly so we don't need to double the memory + // usage to hand it a sparse array here. + distance::pairwise_distance(handle, X, X, data, m, m, n, metric); + // self-loops get max distance + auto transform_in = + thrust::make_zip_iterator(thrust::make_tuple(thrust::make_counting_iterator(0), data)); + + thrust::transform(exec_policy, + transform_in, + transform_in + nnz, + data, + [=] __device__(const thrust::tuple& tup) { + value_idx idx = thrust::get<0>(tup); + bool self_loop = idx % m == idx / m; + return (self_loop * std::numeric_limits::max()) + + (!self_loop * thrust::get<1>(tup)); + }); +} + +/** + * Connectivities specialization for pairwise distances + * @tparam value_idx + * @tparam value_t + */ +template +struct distance_graph_impl { + void run(const raft::device_resources& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + rmm::device_uvector& indptr, + rmm::device_uvector& indices, + rmm::device_uvector& data, + int c) + { + auto stream = handle.get_stream(); + + size_t nnz = m * m; + + indices.resize(nnz, stream); + data.resize(nnz, stream); + + pairwise_distances(handle, X, m, n, metric, indptr.data(), indices.data(), data.data()); + } +}; + /** * Returns a CSR connectivities graph based on the given linkage distance. * @tparam value_idx @@ -120,7 +213,7 @@ struct distance_graph_impl -void get_distance_graph(const raft::handle_t& handle, +void get_distance_graph(raft::device_resources const& handle, const value_t* X, size_t m, size_t n, diff --git a/cpp/include/raft/cluster/detail/kmeans.cuh b/cpp/include/raft/cluster/detail/kmeans.cuh index e575849536..9632fedb9d 100644 --- a/cpp/include/raft/cluster/detail/kmeans.cuh +++ b/cpp/include/raft/cluster/detail/kmeans.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include @@ -59,7 +59,7 @@ namespace detail { // Selects 'n_clusters' samples randomly from X template -void initRandom(const raft::handle_t& handle, +void initRandom(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_matrix_view centroids) @@ -85,7 +85,7 @@ void initRandom(const raft::handle_t& handle, * 5: end for */ template -void kmeansPlusPlus(const raft::handle_t& handle, +void kmeansPlusPlus(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_matrix_view centroidsRawData, @@ -282,7 +282,7 @@ void kmeansPlusPlus(const raft::handle_t& handle, * @param[inout] workspace */ template -void update_centroids(const raft::handle_t& handle, +void update_centroids(raft::device_resources const& handle, raft::device_matrix_view X, raft::device_vector_view sample_weights, raft::device_matrix_view centroids, @@ -356,7 +356,7 @@ void update_centroids(const raft::handle_t& handle, // TODO: Resizing is needed to use mdarray instead of rmm::device_uvector template -void kmeans_fit_main(const raft::handle_t& handle, +void kmeans_fit_main(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_vector_view weight, @@ -573,7 +573,7 @@ void kmeans_fit_main(const raft::handle_t& handle, */ template -void initScalableKMeansPlusPlus(const raft::handle_t& handle, +void initScalableKMeansPlusPlus(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_matrix_view centroidsRawData, @@ -816,7 +816,7 @@ void initScalableKMeansPlusPlus(const raft::handle_t& handle, * @param[out] n_iter Number of iterations run. 
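The change that dominates this file and most of the diff is mechanical: every public and detail-level signature swaps `raft::handle_t` for `raft::device_resources`. A minimal sketch of the pattern, assuming only that the caller previously took a handle (the function name here is hypothetical, not part of the diff):

```cpp
#include <raft/core/device_resources.hpp>

// Before: void run_clustering(const raft::handle_t& handle);
// After:  device_resources is a drop-in replacement exposing the same accessors.
void run_clustering(raft::device_resources const& handle)
{
  cudaStream_t stream = handle.get_stream();         // stream accessor unchanged
  auto policy         = handle.get_thrust_policy();  // thrust policy unchanged
  (void)stream;
  (void)policy;
}
```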
*/ template -void kmeans_fit(handle_t const& handle, +void kmeans_fit(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, std::optional> sample_weight, @@ -955,7 +955,7 @@ void kmeans_fit(handle_t const& handle, } template -void kmeans_fit(handle_t const& handle, +void kmeans_fit(raft::device_resources const& handle, const KMeansParams& params, const DataT* X, const DataT* sample_weight, @@ -980,7 +980,7 @@ void kmeans_fit(handle_t const& handle, } template -void kmeans_predict(handle_t const& handle, +void kmeans_predict(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, std::optional> sample_weight, @@ -1088,7 +1088,7 @@ void kmeans_predict(handle_t const& handle, } template -void kmeans_predict(handle_t const& handle, +void kmeans_predict(raft::device_resources const& handle, const KMeansParams& params, const DataT* X, const DataT* sample_weight, @@ -1120,7 +1120,7 @@ void kmeans_predict(handle_t const& handle, } template -void kmeans_fit_predict(handle_t const& handle, +void kmeans_fit_predict(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, std::optional> sample_weight, @@ -1147,7 +1147,7 @@ void kmeans_fit_predict(handle_t const& handle, } template -void kmeans_fit_predict(handle_t const& handle, +void kmeans_fit_predict(raft::device_resources const& handle, const KMeansParams& params, const DataT* X, const DataT* sample_weight, @@ -1187,7 +1187,7 @@ void kmeans_fit_predict(handle_t const& handle, * @param[out] X_new X transformed in the new space.. */ template -void kmeans_transform(const raft::handle_t& handle, +void kmeans_transform(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_matrix_view centroids, @@ -1228,7 +1228,7 @@ void kmeans_transform(const raft::handle_t& handle, } template -void kmeans_transform(const raft::handle_t& handle, +void kmeans_transform(raft::device_resources const& handle, const KMeansParams& params, const DataT* X, const DataT* centroids, diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh new file mode 100644 index 0000000000..3d23c809c3 --- /dev/null +++ b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh @@ -0,0 +1,1095 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace raft::cluster::detail { + +constexpr static inline float kAdjustCentersWeight = 7.0f; + +/** + * @brief Predict labels for the dataset; floating-point types only. 
+ * + * NB: no minibatch splitting is done here, it may require large amount of temporary memory (n_rows + * * n_cluster * sizeof(MathT)). + * + * @tparam MathT type of the centroids and mapped data + * @tparam IdxT index type + * @tparam LabelT label type + * + * @param[in] handle The raft handle. + * @param[in] params Structure containing the hyper-parameters + * @param[in] centers Pointer to the row-major matrix of cluster centers [n_clusters, dim] + * @param[in] n_clusters Number of clusters/centers + * @param[in] dim Dimensionality of the data + * @param[in] dataset Pointer to the data [n_rows, dim] + * @param[in] dataset_norm Pointer to the precomputed norm (for L2 metrics only) [n_rows] + * @param[in] n_rows Number samples in the `dataset` + * @param[out] labels Output predictions [n_rows] + * @param[inout] mr (optional) Memory resource to use for temporary allocations + */ +template +inline std::enable_if_t> predict_core( + const raft::device_resources& handle, + const kmeans_balanced_params& params, + const MathT* centers, + IdxT n_clusters, + IdxT dim, + const MathT* dataset, + const MathT* dataset_norm, + IdxT n_rows, + LabelT* labels, + rmm::mr::device_memory_resource* mr) +{ + auto stream = handle.get_stream(); + switch (params.metric) { + case raft::distance::DistanceType::L2Expanded: + case raft::distance::DistanceType::L2SqrtExpanded: { + auto workspace = raft::make_device_mdarray( + handle, mr, make_extents((sizeof(int)) * n_rows)); + + auto minClusterAndDistance = raft::make_device_mdarray, IdxT>( + handle, mr, make_extents(n_rows)); + raft::KeyValuePair initial_value(0, std::numeric_limits::max()); + thrust::fill(handle.get_thrust_policy(), + minClusterAndDistance.data_handle(), + minClusterAndDistance.data_handle() + minClusterAndDistance.size(), + initial_value); + + auto centroidsNorm = + raft::make_device_mdarray(handle, mr, make_extents(n_clusters)); + raft::linalg::rowNorm( + centroidsNorm.data_handle(), centers, dim, n_clusters, raft::linalg::L2Norm, true, stream); + + raft::distance::fusedL2NNMinReduce, IdxT>( + minClusterAndDistance.data_handle(), + dataset, + centers, + dataset_norm, + centroidsNorm.data_handle(), + n_rows, + n_clusters, + dim, + (void*)workspace.data_handle(), + (params.metric == raft::distance::DistanceType::L2Expanded) ? false : true, + false, + stream); + + // todo(lsugy): use KVP + iterator in caller. + // Copy keys to output labels + thrust::transform(handle.get_thrust_policy(), + minClusterAndDistance.data_handle(), + minClusterAndDistance.data_handle() + n_rows, + labels, + raft::compose_op, raft::key_op>()); + break; + } + case raft::distance::DistanceType::InnerProduct: { + // TODO: pass buffer + rmm::device_uvector distances(n_rows * n_clusters, stream, mr); + + MathT alpha = -1.0; + MathT beta = 0.0; + + linalg::gemm(handle, + true, + false, + n_clusters, + n_rows, + dim, + &alpha, + centers, + dim, + dataset, + dim, + &beta, + distances.data(), + n_clusters, + stream); + + auto distances_const_view = raft::make_device_matrix_view( + distances.data(), n_rows, n_clusters); + auto labels_view = raft::make_device_vector_view(labels, n_rows); + raft::matrix::argmin(handle, distances_const_view, labels_view); + break; + } + default: { + RAFT_FAIL("The chosen distance metric is not supported (%d)", int(params.metric)); + } + } +} + +/** + * @brief Suggest a minibatch size for kmeans prediction. + * + * This function is used as a heuristic to split the work over a large dataset + * to reduce the size of temporary memory allocations. 
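To make the batching heuristic concrete, here is a standalone host-side sketch of the calculation described above; `suggest_minibatch_size` is a hypothetical helper, while the ~1 GiB budget and the rounding to a multiple of 64 mirror the implementation below:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Cap temporary memory at ~1 GiB: batch = budget / bytes-per-row,
// rounded up to a multiple of 64 and clamped to the dataset size.
inline int64_t suggest_minibatch_size(int64_t n_rows, std::size_t mem_per_row)
{
  auto batch = static_cast<int64_t>((std::size_t{1} << 30) / mem_per_row);
  batch      = 64 * ((batch + 63) / 64);  // same effect as div_rounding_up_safe
  return std::min(batch, n_rows);
}
```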
+ * + * @tparam MathT type of the centroids and mapped data + * @tparam IdxT index type + * + * @param[in] n_clusters number of clusters in kmeans clustering + * @param[in] n_rows Number of samples in the dataset + * @param[in] dim Number of features in the dataset + * @param[in] metric Distance metric + * @param[in] needs_conversion Whether the data needs to be converted to MathT + * @return A suggested minibatch size and the expected memory cost per-row (in bytes) + */ +template +constexpr auto calc_minibatch_size(IdxT n_clusters, + IdxT n_rows, + IdxT dim, + raft::distance::DistanceType metric, + bool needs_conversion) -> std::tuple +{ + n_clusters = std::max(1, n_clusters); + + // Estimate memory needs per row (i.e element of the batch). + size_t mem_per_row = 0; + switch (metric) { + // fusedL2NN needs a mutex and a key-value pair for each row. + case distance::DistanceType::L2Expanded: + case distance::DistanceType::L2SqrtExpanded: { + mem_per_row += sizeof(int); + mem_per_row += sizeof(raft::KeyValuePair); + } break; + // Other metrics require storing a distance matrix. + default: { + mem_per_row += sizeof(MathT) * n_clusters; + } + } + + // If we need to convert to MathT, space required for the converted batch. + if (!needs_conversion) { mem_per_row += sizeof(MathT) * dim; } + + // Heuristic: calculate the minibatch size in order to use at most 1GB of memory. + IdxT minibatch_size = (1 << 30) / mem_per_row; + minibatch_size = 64 * div_rounding_up_safe(minibatch_size, IdxT{64}); + minibatch_size = std::min(minibatch_size, n_rows); + return std::make_tuple(minibatch_size, mem_per_row); +} + +/** + * @brief Given the data and labels, calculate cluster centers and sizes in one sweep. + * + * @note all pointers must be accessible on the device. + * + * @tparam T element type + * @tparam MathT type of the centroids and mapped data + * @tparam IdxT index type + * @tparam LabelT label type + * @tparam CounterT counter type supported by CUDA's native atomicAdd + * @tparam MappingOpT type of the mapping operation + * + * @param[in] handle The raft handle. + * @param[inout] centers Pointer to the output [n_clusters, dim] + * @param[inout] cluster_sizes Number of rows in each cluster [n_clusters] + * @param[in] n_clusters Number of clusters/centers + * @param[in] dim Dimensionality of the data + * @param[in] dataset Pointer to the data [n_rows, dim] + * @param[in] n_rows Number of samples in the `dataset` + * @param[in] labels Output predictions [n_rows] + * @param[in] reset_counters Whether to clear the output arrays before calculating. + * When set to `false`, this function may be used to update existing centers and sizes using + * the weighted average principle. 
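The "weighted average principle" mentioned above amounts to un-averaging the running centers before folding in a new batch. A scalar sketch for one coordinate of one center (`update_center_coord` is illustrative, not part of the diff):

```cpp
// Centers are stored as means, so scale by the old size first, add the
// new coordinate sum, then divide by the combined count. The zero guard
// plays the role of raft::div_checkzero_op for empty clusters.
inline double update_center_coord(double old_mean, long long old_size,
                                  double batch_sum, long long batch_count)
{
  const long long total = old_size + batch_count;
  return total == 0 ? 0.0 : (old_mean * old_size + batch_sum) / total;
}
```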
+ * @param[in] mapping_op Mapping operation from T to MathT + * @param[inout] mr (optional) Memory resource to use for temporary allocations on the device + */ +template +void calc_centers_and_sizes(const raft::device_resources& handle, + MathT* centers, + CounterT* cluster_sizes, + IdxT n_clusters, + IdxT dim, + const T* dataset, + IdxT n_rows, + const LabelT* labels, + bool reset_counters, + MappingOpT mapping_op, + rmm::mr::device_memory_resource* mr = nullptr) +{ + auto stream = handle.get_stream(); + if (mr == nullptr) { mr = handle.get_workspace_resource(); } + + if (!reset_counters) { + raft::linalg::matrixVectorOp( + centers, centers, cluster_sizes, dim, n_clusters, true, false, raft::mul_op(), stream); + } + + rmm::device_uvector workspace(0, stream, mr); + + // If we reset the counters, we can compute directly the new sizes in cluster_sizes. + // If we don't reset, we compute in a temporary buffer and add in a separate step. + rmm::device_uvector temp_cluster_sizes(0, stream, mr); + CounterT* temp_sizes = cluster_sizes; + if (!reset_counters) { + temp_cluster_sizes.resize(n_clusters, stream); + temp_sizes = temp_cluster_sizes.data(); + } + + // Apply mapping only when the data and math types are different. + if constexpr (std::is_same_v) { + raft::linalg::reduce_rows_by_key( + dataset, dim, labels, nullptr, n_rows, dim, n_clusters, centers, stream, reset_counters); + } else { + // todo(lsugy): use iterator from KV output of fusedL2NN + cub::TransformInputIterator mapping_itr(dataset, mapping_op); + raft::linalg::reduce_rows_by_key( + mapping_itr, dim, labels, nullptr, n_rows, dim, n_clusters, centers, stream, reset_counters); + } + + // Compute weight of each cluster + raft::cluster::detail::countLabels(handle, labels, temp_sizes, n_rows, n_clusters, workspace); + + // Add previous sizes if necessary + if (!reset_counters) { + raft::linalg::add(cluster_sizes, cluster_sizes, temp_sizes, n_clusters, stream); + } + + raft::linalg::matrixVectorOp(centers, + centers, + cluster_sizes, + dim, + n_clusters, + true, + false, + raft::div_checkzero_op(), + stream); +} + +/** Computes the L2 norm of the dataset, converting to MathT if necessary */ +template +void compute_norm(const raft::device_resources& handle, + MathT* dataset_norm, + const T* dataset, + IdxT dim, + IdxT n_rows, + MappingOpT mapping_op, + rmm::mr::device_memory_resource* mr = nullptr) +{ + common::nvtx::range fun_scope("compute_norm"); + auto stream = handle.get_stream(); + if (mr == nullptr) { mr = handle.get_workspace_resource(); } + rmm::device_uvector mapped_dataset(0, stream, mr); + + const MathT* dataset_ptr = nullptr; + + if (std::is_same_v) { + dataset_ptr = reinterpret_cast(dataset); + } else { + mapped_dataset.resize(n_rows * dim, stream); + + linalg::unaryOp(mapped_dataset.data(), dataset, n_rows * dim, mapping_op, stream); + + dataset_ptr = (const MathT*)mapped_dataset.data(); + } + + raft::linalg::rowNorm( + dataset_norm, dataset_ptr, dim, n_rows, raft::linalg::L2Norm, true, stream); +} + +/** + * @brief Predict labels for the dataset. 
+ * + * @tparam T element type + * @tparam MathT type of the centroids and mapped data + * @tparam IdxT index type + * @tparam LabelT label type + * @tparam MappingOpT type of the mapping operation + * + * @param[in] handle The raft handle + * @param[in] params Structure containing the hyper-parameters + * @param[in] centers Pointer to the row-major matrix of cluster centers [n_clusters, dim] + * @param[in] n_clusters Number of clusters/centers + * @param[in] dim Dimensionality of the data + * @param[in] dataset Pointer to the data [n_rows, dim] + * @param[in] n_rows Number samples in the `dataset` + * @param[out] labels Output predictions [n_rows] + * @param[in] mapping_op Mapping operation from T to MathT + * @param[inout] mr (optional) memory resource to use for temporary allocations + * @param[in] dataset_norm (optional) Pre-computed norms of each row in the dataset [n_rows] + */ +template +void predict(const raft::device_resources& handle, + const kmeans_balanced_params& params, + const MathT* centers, + IdxT n_clusters, + IdxT dim, + const T* dataset, + IdxT n_rows, + LabelT* labels, + MappingOpT mapping_op, + rmm::mr::device_memory_resource* mr = nullptr, + const MathT* dataset_norm = nullptr) +{ + auto stream = handle.get_stream(); + common::nvtx::range fun_scope( + "predict(%zu, %u)", static_cast(n_rows), n_clusters); + if (mr == nullptr) { mr = handle.get_workspace_resource(); } + auto [max_minibatch_size, _mem_per_row] = + calc_minibatch_size(n_clusters, n_rows, dim, params.metric, std::is_same_v); + rmm::device_uvector cur_dataset( + std::is_same_v ? 0 : max_minibatch_size * dim, stream, mr); + bool need_compute_norm = + dataset_norm == nullptr && (params.metric == raft::distance::DistanceType::L2Expanded || + params.metric == raft::distance::DistanceType::L2SqrtExpanded); + rmm::device_uvector cur_dataset_norm( + need_compute_norm ? max_minibatch_size : 0, stream, mr); + const MathT* dataset_norm_ptr = nullptr; + auto cur_dataset_ptr = cur_dataset.data(); + for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) { + IdxT minibatch_size = std::min(max_minibatch_size, n_rows - offset); + + if constexpr (std::is_same_v) { + cur_dataset_ptr = const_cast(dataset + offset * dim); + } else { + linalg::unaryOp( + cur_dataset_ptr, dataset + offset * dim, minibatch_size * dim, mapping_op, stream); + } + + // Compute the norm now if it hasn't been pre-computed. 
+ if (need_compute_norm) { + compute_norm( + handle, cur_dataset_norm.data(), cur_dataset_ptr, dim, minibatch_size, mapping_op, mr); + dataset_norm_ptr = cur_dataset_norm.data(); + } else if (dataset_norm != nullptr) { + dataset_norm_ptr = dataset_norm + offset; + } + + predict_core(handle, + params, + centers, + n_clusters, + dim, + cur_dataset_ptr, + dataset_norm_ptr, + minibatch_size, + labels + offset, + mr); + } +} + +template +__global__ void __launch_bounds__((WarpSize * BlockDimY)) + adjust_centers_kernel(MathT* centers, // [n_clusters, dim] + IdxT n_clusters, + IdxT dim, + const T* dataset, // [n_rows, dim] + IdxT n_rows, + const LabelT* labels, // [n_rows] + const CounterT* cluster_sizes, // [n_clusters] + MathT threshold, + IdxT average, + IdxT seed, + IdxT* count, + MappingOpT mapping_op) +{ + IdxT l = threadIdx.y + BlockDimY * static_cast(blockIdx.y); + if (l >= n_clusters) return; + auto csize = static_cast(cluster_sizes[l]); + // skip big clusters + if (csize > static_cast(average * threshold)) return; + + // choose a "random" i that belongs to a rather large cluster + IdxT i; + IdxT j = laneId(); + if (j == 0) { + do { + auto old = atomicAdd(count, IdxT{1}); + i = (seed * (old + 1)) % n_rows; + } while (static_cast(cluster_sizes[labels[i]]) < average); + } + i = raft::shfl(i, 0); + + // Adjust the center of the selected smaller cluster to gravitate towards + // a sample from the selected larger cluster. + const IdxT li = static_cast(labels[i]); + // Weight of the current center for the weighted average. + // We dump it for anomalously small clusters, but keep constant otherwise. + const MathT wc = min(static_cast(csize), static_cast(kAdjustCentersWeight)); + // Weight for the datapoint used to shift the center. + const MathT wd = 1.0; + for (; j < dim; j += WarpSize) { + MathT val = 0; + val += wc * centers[j + dim * li]; + val += wd * mapping_op(dataset[j + dim * i]); + val /= wc + wd; + centers[j + dim * l] = val; + } +} + +/** + * @brief Adjust centers for clusters that have small number of entries. + * + * For each cluster, where the cluster size is not bigger than a threshold, the center is moved + * towards a data point that belongs to a large cluster. + * + * NB: if this function returns `true`, you should update the labels. + * + * NB: all pointers must be on the device side. + * + * @tparam T element type + * @tparam MathT type of the centroids and mapped data + * @tparam IdxT index type + * @tparam LabelT label type + * @tparam CounterT counter type supported by CUDA's native atomicAdd + * @tparam MappingOpT type of the mapping operation + * + * @param[inout] centers cluster centers [n_clusters, dim] + * @param[in] n_clusters number of rows in `centers` + * @param[in] dim number of columns in `centers` and `dataset` + * @param[in] dataset a host pointer to the row-major data matrix [n_rows, dim] + * @param[in] n_rows number of rows in `dataset` + * @param[in] labels a host pointer to the cluster indices [n_rows] + * @param[in] cluster_sizes number of rows in each cluster [n_clusters] + * @param[in] threshold defines a criterion for adjusting a cluster + * (cluster_sizes <= average_size * threshold) + * 0 <= threshold < 1 + * @param[in] mapping_op Mapping operation from T to MathT + * @param[in] stream CUDA stream + * @param[inout] device_memory memory resource to use for temporary allocations + * + * @return whether any of the centers has been updated (and thus, `labels` need to be recalculated). 
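The kernel above moves a small cluster's center with a two-point weighted average. The same update for a single coordinate, extracted as a plain function for readability (the function name is hypothetical; the weights come straight from the kernel):

```cpp
#include <algorithm>

// wc: weight of the current center, capped at kAdjustCentersWeight (7.0f);
// wd: weight of the data point pulled in from a large cluster.
inline float adjust_center_coord(float center, float sample, float cluster_size)
{
  const float wc = std::min(cluster_size, 7.0f);  // kAdjustCentersWeight
  const float wd = 1.0f;
  return (wc * center + wd * sample) / (wc + wd);
}
```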
 */ +template +auto adjust_centers(MathT* centers, + IdxT n_clusters, + IdxT dim, + const T* dataset, + IdxT n_rows, + const LabelT* labels, + const CounterT* cluster_sizes, + MathT threshold, + MappingOpT mapping_op, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* device_memory) -> bool +{ + common::nvtx::range fun_scope( + "adjust_centers(%zu, %u)", static_cast(n_rows), n_clusters); + if (n_clusters == 0) { return false; } + constexpr static std::array kPrimes{29, 71, 113, 173, 229, 281, 349, 409, 463, 541, + 601, 659, 733, 809, 863, 941, 1013, 1069, 1151, 1223, + 1291, 1373, 1451, 1511, 1583, 1657, 1733, 1811, 1889, 1987, + 2053, 2129, 2213, 2287, 2357, 2423, 2531, 2617, 2687, 2741}; + static IdxT i = 0; + static IdxT i_primes = 0; + + bool adjusted = false; + IdxT average = n_rows / n_clusters; + IdxT ofst; + do { + i_primes = (i_primes + 1) % kPrimes.size(); + ofst = kPrimes[i_primes]; + } while (n_rows % ofst == 0); + + constexpr uint32_t kBlockDimY = 4; + const dim3 block_dim(WarpSize, kBlockDimY, 1); + const dim3 grid_dim(1, raft::ceildiv(n_clusters, static_cast(kBlockDimY)), 1); + rmm::device_scalar update_count(0, stream, device_memory); + adjust_centers_kernel<<>>(centers, + n_clusters, + dim, + dataset, + n_rows, + labels, + cluster_sizes, + threshold, + average, + ofst, + update_count.data(), + mapping_op); + adjusted = update_count.value(stream) > 0; // NB: rmm scalar performs the sync + + return adjusted; +} + +/** + * @brief Expectation-maximization-balancing combined in an iterative process. + * + * Note: `cluster_centers` is assumed to be already initialized here. + * Thus, this function can be used for fine-tuning existing clusters; + * to train from scratch, use the `build_clusters` function below. + * + * @tparam T element type + * @tparam MathT type of the centroids and mapped data + * @tparam IdxT index type + * @tparam LabelT label type + * @tparam CounterT counter type supported by CUDA's native atomicAdd + * @tparam MappingOpT type of the mapping operation + * + * @param[in] handle The raft handle + * @param[in] params Structure containing the hyper-parameters + * @param[in] n_iters Requested number of iterations (can differ from params.n_iter!) + * @param[in] dim Dimensionality of the dataset + * @param[in] dataset Pointer to a managed row-major array [n_rows, dim] + * @param[in] dataset_norm Pointer to the precomputed norm (for L2 metrics only) [n_rows] + * @param[in] n_rows Number of rows in the dataset + * @param[in] n_clusters Requested number of clusters + * @param[inout] cluster_centers Pointer to a managed row-major array [n_clusters, dim] + * @param[out] cluster_labels Pointer to a managed row-major array [n_rows] + * @param[out] cluster_sizes Pointer to a managed row-major array [n_clusters] + * @param[in] balancing_pullback + * if the cluster centers are rebalanced on this number of iterations, + * one extra iteration is performed (this could happen several times) (default should be `2`). + * In other words, the first and then every `balancing_pullback`-th rebalancing operation adds + * one more iteration to the main cycle. + * @param[in] balancing_threshold + * the rebalancing takes place if any cluster is smaller than `avg_size * balancing_threshold` + * on a given iteration (default should be `~ 0.25`). 
+ * @param[in] mapping_op Mapping operation from T to MathT + * @param[inout] device_memory + * A memory resource for device allocations (makes sense to provide a memory pool here) + */ +template +void balancing_em_iters(const raft::device_resources& handle, + const kmeans_balanced_params& params, + uint32_t n_iters, + IdxT dim, + const T* dataset, + const MathT* dataset_norm, + IdxT n_rows, + IdxT n_clusters, + MathT* cluster_centers, + LabelT* cluster_labels, + CounterT* cluster_sizes, + uint32_t balancing_pullback, + MathT balancing_threshold, + MappingOpT mapping_op, + rmm::mr::device_memory_resource* device_memory) +{ + auto stream = handle.get_stream(); + uint32_t balancing_counter = balancing_pullback; + for (uint32_t iter = 0; iter < n_iters; iter++) { + // Balancing step - move the centers around to equalize cluster sizes + // (but not on the first iteration) + if (iter > 0 && adjust_centers(cluster_centers, + n_clusters, + dim, + dataset, + n_rows, + cluster_labels, + cluster_sizes, + balancing_threshold, + mapping_op, + stream, + device_memory)) { + if (balancing_counter++ >= balancing_pullback) { + balancing_counter -= balancing_pullback; + n_iters++; + } + } + switch (params.metric) { + // For some metrics, cluster calculation and adjustment tends to favor zero center vectors. + // To avoid converging to zero, we normalize the center vectors on every iteration. + case raft::distance::DistanceType::InnerProduct: + case raft::distance::DistanceType::CosineExpanded: + case raft::distance::DistanceType::CorrelationExpanded: { + auto clusters_in_view = raft::make_device_matrix_view( + cluster_centers, n_clusters, dim); + auto clusters_out_view = raft::make_device_matrix_view( + cluster_centers, n_clusters, dim); + raft::linalg::row_normalize( + handle, clusters_in_view, clusters_out_view, raft::linalg::L2Norm); + break; + } + default: break; + } + // E: Expectation step - predict labels + predict(handle, + params, + cluster_centers, + n_clusters, + dim, + dataset, + n_rows, + cluster_labels, + mapping_op, + device_memory, + dataset_norm); + // M: Maximization step - calculate optimal cluster centers + calc_centers_and_sizes(handle, + cluster_centers, + cluster_sizes, + n_clusters, + dim, + dataset, + n_rows, + cluster_labels, + true, + mapping_op, + device_memory); + } +} + +/** Randomly initialize cluster centers and then call `balancing_em_iters`. */ +template +void build_clusters(const raft::device_resources& handle, + const kmeans_balanced_params& params, + IdxT dim, + const T* dataset, + IdxT n_rows, + IdxT n_clusters, + MathT* cluster_centers, + LabelT* cluster_labels, + CounterT* cluster_sizes, + MappingOpT mapping_op, + rmm::mr::device_memory_resource* device_memory, + const MathT* dataset_norm = nullptr) +{ + auto stream = handle.get_stream(); + + // "randomly" initialize labels + auto labels_view = raft::make_device_vector_view(cluster_labels, n_rows); + linalg::map_offset( + handle, + labels_view, + raft::compose_op(raft::cast_op(), raft::mod_const_op(n_clusters))); + + // update centers to match the initialized labels. 
+ calc_centers_and_sizes(handle, + cluster_centers, + cluster_sizes, + n_clusters, + dim, + dataset, + n_rows, + cluster_labels, + true, + mapping_op, + device_memory); + + // run EM + balancing_em_iters(handle, + params, + params.n_iters, + dim, + dataset, + dataset_norm, + n_rows, + n_clusters, + cluster_centers, + cluster_labels, + cluster_sizes, + 2, + MathT{0.25}, + mapping_op, + device_memory); +} + +/** Calculate how many fine clusters should belong to each mesocluster. */ +template +inline auto arrange_fine_clusters(IdxT n_clusters, + IdxT n_mesoclusters, + IdxT n_rows, + const CounterT* mesocluster_sizes) +{ + std::vector fine_clusters_nums(n_mesoclusters); + std::vector fine_clusters_csum(n_mesoclusters + 1); + fine_clusters_csum[0] = 0; + + IdxT n_lists_rem = n_clusters; + IdxT n_nonempty_ms_rem = 0; + for (IdxT i = 0; i < n_mesoclusters; i++) { + n_nonempty_ms_rem += mesocluster_sizes[i] > CounterT{0} ? 1 : 0; + } + IdxT n_rows_rem = n_rows; + CounterT mesocluster_size_sum = 0; + CounterT mesocluster_size_max = 0; + IdxT fine_clusters_nums_max = 0; + for (IdxT i = 0; i < n_mesoclusters; i++) { + if (i < n_mesoclusters - 1) { + // Although the algorithm is meant to produce balanced clusters, when something + // goes wrong, we may get empty clusters (e.g. during development/debugging). + // The code below ensures a proportional arrangement of fine cluster numbers + // per mesocluster, even if some clusters are empty. + if (mesocluster_sizes[i] == 0) { + fine_clusters_nums[i] = 0; + } else { + n_nonempty_ms_rem--; + auto s = static_cast( + static_cast(n_lists_rem * mesocluster_sizes[i]) / n_rows_rem + .5); + s = std::min(s, n_lists_rem - n_nonempty_ms_rem); + fine_clusters_nums[i] = std::max(s, IdxT{1}); + } + } else { + fine_clusters_nums[i] = n_lists_rem; + } + n_lists_rem -= fine_clusters_nums[i]; + n_rows_rem -= mesocluster_sizes[i]; + mesocluster_size_max = max(mesocluster_size_max, mesocluster_sizes[i]); + mesocluster_size_sum += mesocluster_sizes[i]; + fine_clusters_nums_max = max(fine_clusters_nums_max, fine_clusters_nums[i]); + fine_clusters_csum[i + 1] = fine_clusters_csum[i] + fine_clusters_nums[i]; + } + + RAFT_EXPECTS(static_cast(mesocluster_size_sum) == n_rows, + "mesocluster sizes do not add up (%zu) to the total trainset size (%zu)", + static_cast(mesocluster_size_sum), + static_cast(n_rows)); + RAFT_EXPECTS(fine_clusters_csum[n_mesoclusters] == n_clusters, + "fine cluster numbers do not add up (%zu) to the total number of clusters (%zu)", + static_cast(fine_clusters_csum[n_mesoclusters]), + static_cast(n_clusters)); + + return std::make_tuple(static_cast(mesocluster_size_max), + fine_clusters_nums_max, + std::move(fine_clusters_nums), + std::move(fine_clusters_csum)); +} + +/** + * Given the (coarse) mesoclusters and the distribution of fine clusters within them, + * build the fine clusters. + * + * Processing one mesocluster at a time: + * 1. Copy mesocluster data into a separate buffer + * 2. Predict fine cluster labels + * 3. Refine the fine cluster centers + * + * As a result, the fine clusters are what is returned by `build_hierarchical`; + * this function returns the total number of fine clusters, which can be checked to be + * the same as the requested number of clusters. + * + * Note: this function uses at most `fine_clusters_nums_max` points per mesocluster for training; + * if one of the clusters is larger than that (as given by `mesocluster_sizes`), the extra data + * is ignored and a warning is reported. 
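`arrange_fine_clusters` above hands each mesocluster a share of the `n_clusters` fine clusters proportional to its share of the rows, with the last mesocluster absorbing the remainder so the totals match exactly. A trimmed, host-only sketch of that split (it omits the clamp that reserves one list for every remaining non-empty mesocluster):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <vector>

inline std::vector<int64_t> split_fine_clusters(int64_t n_clusters,
                                                const std::vector<int64_t>& meso_sizes)
{
  std::vector<int64_t> nums(meso_sizes.size(), 0);
  int64_t lists_rem = n_clusters;
  int64_t rows_rem  = std::accumulate(meso_sizes.begin(), meso_sizes.end(), int64_t{0});
  for (std::size_t i = 0; i + 1 < meso_sizes.size(); ++i) {
    if (meso_sizes[i] == 0) { continue; }  // empty mesoclusters get no fine clusters
    auto share = static_cast<int64_t>(
      static_cast<double>(lists_rem) * meso_sizes[i] / rows_rem + 0.5);
    nums[i] = std::max<int64_t>(share, 1);
    lists_rem -= nums[i];
    rows_rem -= meso_sizes[i];
  }
  if (!nums.empty()) { nums.back() = lists_rem; }  // remainder goes to the last one
  return nums;
}
```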
+ */ +template +auto build_fine_clusters(const raft::device_resources& handle, + const kmeans_balanced_params& params, + IdxT dim, + const T* dataset_mptr, + const MathT* dataset_norm_mptr, + const LabelT* labels_mptr, + IdxT n_rows, + const IdxT* fine_clusters_nums, + const IdxT* fine_clusters_csum, + const CounterT* mesocluster_sizes, + IdxT n_mesoclusters, + IdxT mesocluster_size_max, + IdxT fine_clusters_nums_max, + MathT* cluster_centers, + MappingOpT mapping_op, + rmm::mr::device_memory_resource* managed_memory, + rmm::mr::device_memory_resource* device_memory) -> IdxT +{ + auto stream = handle.get_stream(); + rmm::device_uvector mc_trainset_ids_buf(mesocluster_size_max, stream, managed_memory); + rmm::device_uvector mc_trainset_buf(mesocluster_size_max * dim, stream, device_memory); + rmm::device_uvector mc_trainset_norm_buf(mesocluster_size_max, stream, device_memory); + auto mc_trainset_ids = mc_trainset_ids_buf.data(); + auto mc_trainset = mc_trainset_buf.data(); + auto mc_trainset_norm = mc_trainset_norm_buf.data(); + + // label (cluster ID) of each vector + rmm::device_uvector mc_trainset_labels(mesocluster_size_max, stream, device_memory); + + rmm::device_uvector mc_trainset_ccenters( + fine_clusters_nums_max * dim, stream, device_memory); + // number of vectors in each cluster + rmm::device_uvector mc_trainset_csizes_tmp( + fine_clusters_nums_max, stream, device_memory); + + // Training clusters in each meso-cluster + IdxT n_clusters_done = 0; + for (IdxT i = 0; i < n_mesoclusters; i++) { + IdxT k = 0; + for (IdxT j = 0; j < n_rows && k < mesocluster_size_max; j++) { + if (labels_mptr[j] == LabelT(i)) { mc_trainset_ids[k++] = j; } + } + if (k != static_cast(mesocluster_sizes[i])) + RAFT_LOG_WARN("Incorrect mesocluster size at %d. %zu vs %zu", + static_cast(i), + static_cast(k), + static_cast(mesocluster_sizes[i])); + if (k == 0) { + RAFT_LOG_DEBUG("Empty cluster %d", i); + RAFT_EXPECTS(fine_clusters_nums[i] == 0, + "Number of fine clusters must be zero for the empty mesocluster (got %d)", + static_cast(fine_clusters_nums[i])); + continue; + } else { + RAFT_EXPECTS(fine_clusters_nums[i] > 0, + "Number of fine clusters must be non-zero for a non-empty mesocluster"); + } + + cub::TransformInputIterator mapping_itr(dataset_mptr, mapping_op); + raft::matrix::gather(mapping_itr, dim, n_rows, mc_trainset_ids, k, mc_trainset, stream); + if (params.metric == raft::distance::DistanceType::L2Expanded || + params.metric == raft::distance::DistanceType::L2SqrtExpanded) { + thrust::gather(handle.get_thrust_policy(), + mc_trainset_ids, + mc_trainset_ids + k, + dataset_norm_mptr, + mc_trainset_norm); + } + + build_clusters(handle, + params, + dim, + mc_trainset, + k, + fine_clusters_nums[i], + mc_trainset_ccenters.data(), + mc_trainset_labels.data(), + mc_trainset_csizes_tmp.data(), + mapping_op, + device_memory, + mc_trainset_norm); + + raft::copy(cluster_centers + (dim * fine_clusters_csum[i]), + mc_trainset_ccenters.data(), + fine_clusters_nums[i] * dim, + stream); + handle.sync_stream(stream); + n_clusters_done += fine_clusters_nums[i]; + } + return n_clusters_done; +} + +/** + * @brief Hierarchical balanced k-means + * + * @tparam T element type + * @tparam MathT type of the centroids and mapped data + * @tparam IdxT index type + * @tparam LabelT label type + * @tparam MappingOpT type of the mapping operation + * + * @param[in] handle The raft handle. 
+ * @param[in] params Structure containing the hyper-parameters + * @param dim number of columns in `centers` and `dataset` + * @param[in] dataset a device pointer to the source dataset [n_rows, dim] + * @param n_rows number of rows in the input + * @param[out] cluster_centers a device pointer to the found cluster centers [n_cluster, dim] + * @param n_cluster + * @param metric the distance type + * @param mapping_op Mapping operation from T to MathT + * @param stream + */ +template +void build_hierarchical(const raft::device_resources& handle, + const kmeans_balanced_params& params, + IdxT dim, + const T* dataset, + IdxT n_rows, + MathT* cluster_centers, + IdxT n_clusters, + MappingOpT mapping_op) +{ + auto stream = handle.get_stream(); + using LabelT = uint32_t; + + common::nvtx::range fun_scope( + "build_hierarchical(%zu, %u)", static_cast(n_rows), n_clusters); + + IdxT n_mesoclusters = std::min(n_clusters, static_cast(std::sqrt(n_clusters) + 0.5)); + RAFT_LOG_DEBUG("build_hierarchical: n_mesoclusters: %u", n_mesoclusters); + + rmm::mr::managed_memory_resource managed_memory; + rmm::mr::device_memory_resource* device_memory = handle.get_workspace_resource(); + auto [max_minibatch_size, mem_per_row] = + calc_minibatch_size(n_clusters, n_rows, dim, params.metric, std::is_same_v); + auto pool_guard = + raft::get_pool_memory_resource(device_memory, mem_per_row * size_t(max_minibatch_size)); + if (pool_guard) { + RAFT_LOG_DEBUG("build_hierarchical: using pool memory resource with initial size %zu bytes", + pool_guard->pool_size()); + } + + // Precompute the L2 norm of the dataset if relevant. + const MathT* dataset_norm = nullptr; + rmm::device_uvector dataset_norm_buf(0, stream, device_memory); + if (params.metric == raft::distance::DistanceType::L2Expanded || + params.metric == raft::distance::DistanceType::L2SqrtExpanded) { + dataset_norm_buf.resize(n_rows, stream); + for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) { + IdxT minibatch_size = std::min(max_minibatch_size, n_rows - offset); + compute_norm(handle, + dataset_norm_buf.data() + offset, + dataset + dim * offset, + dim, + minibatch_size, + mapping_op, + device_memory); + } + dataset_norm = (const MathT*)dataset_norm_buf.data(); + } + + /* Temporary workaround to cub::DeviceHistogram not supporting any type that isn't natively + * supported by atomicAdd: find a supported CounterT based on the IdxT. 
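The `typedef` that follows lost its angle-bracket contents in extraction; its purpose, per the comment above, is to pick a histogram counter type that `atomicAdd` handles natively. A plausible reconstruction of the intent (the exact condition is an assumption, not the verbatim source):

```cpp
#include <cstdint>
#include <type_traits>

// Counters for cub::DeviceHistogram must be natively atomicAdd-able:
// count 32-bit indices in uint32_t, anything wider in unsigned long long.
template <typename IdxT>
using CounterT =
  std::conditional_t<(sizeof(IdxT) <= sizeof(uint32_t)), uint32_t, unsigned long long>;
```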
*/ + typedef typename std::conditional_t + CounterT; + + // build coarse clusters (mesoclusters) + rmm::device_uvector mesocluster_labels_buf(n_rows, stream, &managed_memory); + rmm::device_uvector mesocluster_sizes_buf(n_mesoclusters, stream, &managed_memory); + { + rmm::device_uvector mesocluster_centers_buf(n_mesoclusters * dim, stream, device_memory); + build_clusters(handle, + params, + dim, + dataset, + n_rows, + n_mesoclusters, + mesocluster_centers_buf.data(), + mesocluster_labels_buf.data(), + mesocluster_sizes_buf.data(), + mapping_op, + device_memory, + dataset_norm); + } + + auto mesocluster_sizes = mesocluster_sizes_buf.data(); + auto mesocluster_labels = mesocluster_labels_buf.data(); + + handle.sync_stream(stream); + + // build fine clusters + auto [mesocluster_size_max, fine_clusters_nums_max, fine_clusters_nums, fine_clusters_csum] = + arrange_fine_clusters(n_clusters, n_mesoclusters, n_rows, mesocluster_sizes); + + const IdxT mesocluster_size_max_balanced = div_rounding_up_safe( + 2lu * size_t(n_rows), std::max(size_t(n_mesoclusters), 1lu)); + if (mesocluster_size_max > mesocluster_size_max_balanced) { + RAFT_LOG_WARN( + "build_hierarchical: built unbalanced mesoclusters (max_mesocluster_size == %u > %u). " + "At most %u points will be used for training within each mesocluster. " + "Consider increasing the number of training iterations `n_iters`.", + mesocluster_size_max, + mesocluster_size_max_balanced, + mesocluster_size_max_balanced); + RAFT_LOG_TRACE_VEC(mesocluster_sizes, n_mesoclusters); + RAFT_LOG_TRACE_VEC(fine_clusters_nums.data(), n_mesoclusters); + mesocluster_size_max = mesocluster_size_max_balanced; + } + + auto n_clusters_done = build_fine_clusters(handle, + params, + dim, + dataset, + dataset_norm, + mesocluster_labels, + n_rows, + fine_clusters_nums.data(), + fine_clusters_csum.data(), + mesocluster_sizes, + n_mesoclusters, + mesocluster_size_max, + fine_clusters_nums_max, + cluster_centers, + mapping_op, + &managed_memory, + device_memory); + RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters."); + + rmm::device_uvector cluster_sizes(n_clusters, stream, device_memory); + rmm::device_uvector labels(n_rows, stream, device_memory); + + // Fine-tuning k-means for all clusters + // + // (*) Since the likely cluster centroids have been calculated hierarchically already, the number + // of iterations for fine-tuning kmeans for whole clusters should be reduced. However, there is a + // possibility that the clusters could be unbalanced here, in which case the actual number of + // iterations would be increased. + // + balancing_em_iters(handle, + params, + std::max(params.n_iters / 10, 2), + dim, + dataset, + dataset_norm, + n_rows, + n_clusters, + cluster_centers, + labels.data(), + cluster_sizes.data(), + 5, + MathT{0.2}, + mapping_op, + device_memory); +} + +} // namespace raft::cluster::detail diff --git a/cpp/include/raft/cluster/detail/kmeans_common.cuh b/cpp/include/raft/cluster/detail/kmeans_common.cuh index 2fd33ac759..76fc22e99e 100644 --- a/cpp/include/raft/cluster/detail/kmeans_common.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_common.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include @@ -88,7 +88,7 @@ struct KeyValueIndexOp { // Computes the intensity histogram from a sequence of labels template -void countLabels(const raft::handle_t& handle, +void countLabels(raft::device_resources const& handle, SampleIteratorT labels, CounterT* count, IndexT n_samples, @@ -96,9 +96,13 @@ void countLabels(const raft::handle_t& handle, rmm::device_uvector& workspace) { cudaStream_t stream = handle.get_stream(); - IndexT num_levels = n_clusters + 1; - IndexT lower_level = 0; - IndexT upper_level = n_clusters; + + // CUB::DeviceHistogram requires a signed index type + typedef typename std::make_signed_t CubIndexT; + + CubIndexT num_levels = n_clusters + 1; + CubIndexT lower_level = 0; + CubIndexT upper_level = n_clusters; size_t temp_storage_bytes = 0; RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(nullptr, @@ -108,7 +112,7 @@ void countLabels(const raft::handle_t& handle, num_levels, lower_level, upper_level, - n_samples, + static_cast(n_samples), stream)); workspace.resize(temp_storage_bytes, stream); @@ -120,12 +124,12 @@ void countLabels(const raft::handle_t& handle, num_levels, lower_level, upper_level, - n_samples, + static_cast(n_samples), stream)); } template -void checkWeight(const raft::handle_t& handle, +void checkWeight(raft::device_resources const& handle, raft::device_vector_view weight, rmm::device_uvector& workspace) { @@ -183,7 +187,7 @@ template -void computeClusterCost(const raft::handle_t& handle, +void computeClusterCost(raft::device_resources const& handle, raft::device_vector_view minClusterDistance, rmm::device_uvector& workspace, raft::device_scalar_view clusterCost, @@ -218,7 +222,7 @@ void computeClusterCost(const raft::handle_t& handle, } template -void sampleCentroids(const raft::handle_t& handle, +void sampleCentroids(raft::device_resources const& handle, raft::device_matrix_view X, raft::device_vector_view minClusterDistance, raft::device_vector_view isSampleCentroid, @@ -282,7 +286,7 @@ void sampleCentroids(const raft::handle_t& handle, // calculate pairwise distance between 'dataset[n x d]' and 'centroids[k x d]', // result will be stored in 'pairwiseDistance[n x k]' template -void pairwise_distance_kmeans(const raft::handle_t& handle, +void pairwise_distance_kmeans(raft::device_resources const& handle, raft::device_matrix_view X, raft::device_matrix_view centroids, raft::device_matrix_view pairwiseDistance, @@ -310,7 +314,7 @@ void pairwise_distance_kmeans(const raft::handle_t& handle, // shuffle and randomly select 'n_samples_to_gather' from input 'in' and stores // in 'out' does not modify the input template -void shuffleAndGather(const raft::handle_t& handle, +void shuffleAndGather(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out, uint32_t n_samples_to_gather, @@ -335,7 +339,7 @@ void shuffleAndGather(const raft::handle_t& handle, in.extent(1), in.extent(0), indices.data_handle(), - n_samples_to_gather, + static_cast(n_samples_to_gather), out.data_handle(), stream); } @@ -345,7 +349,7 @@ void shuffleAndGather(const raft::handle_t& handle, // is the distance between the sample and the 'centroid[key]' template void minClusterAndDistanceCompute( - const raft::handle_t& handle, + raft::device_resources const& handle, raft::device_matrix_view X, raft::device_matrix_view centroids, raft::device_vector_view, IndexT> minClusterAndDistance, @@ -478,7 +482,7 @@ void minClusterAndDistanceCompute( } template -void 
minClusterDistanceCompute(const raft::handle_t& handle, +void minClusterDistanceCompute(raft::device_resources const& handle, raft::device_matrix_view X, raft::device_matrix_view centroids, raft::device_vector_view minClusterDistance, @@ -596,7 +600,7 @@ void minClusterDistanceCompute(const raft::handle_t& handle, } template -void countSamplesInCluster(const raft::handle_t& handle, +void countSamplesInCluster(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_vector_view L2NormX, diff --git a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh index 2746b6f657..a9d8777304 100644 --- a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,7 +42,7 @@ #include #include -#include +#include #include #include #include @@ -360,7 +360,7 @@ static __global__ void divideCentroids(index_type_t d, * @return Zero if successful. Otherwise non-zero. */ template -static int chooseNewCentroid(handle_t const& handle, +static int chooseNewCentroid(raft::device_resources const& handle, index_type_t n, index_type_t d, value_type_t rand, @@ -457,7 +457,7 @@ static int chooseNewCentroid(handle_t const& handle, * @return Zero if successful. Otherwise non-zero. */ template -static int initializeCentroids(handle_t const& handle, +static int initializeCentroids(raft::device_resources const& handle, index_type_t n, index_type_t d, index_type_t k, @@ -568,7 +568,7 @@ static int initializeCentroids(handle_t const& handle, * @return Zero if successful. Otherwise non-zero. */ template -static int assignCentroids(handle_t const& handle, +static int assignCentroids(raft::device_resources const& handle, index_type_t n, index_type_t d, index_type_t k, @@ -640,7 +640,7 @@ static int assignCentroids(handle_t const& handle, * @return Zero if successful. Otherwise non-zero. */ template -static int updateCentroids(handle_t const& handle, +static int updateCentroids(raft::device_resources const& handle, index_type_t n, index_type_t d, index_type_t k, @@ -783,7 +783,7 @@ static int updateCentroids(handle_t const& handle, * @return error flag. */ template -int kmeans(handle_t const& handle, +int kmeans(raft::device_resources const& handle, index_type_t n, index_type_t d, index_type_t k, @@ -950,7 +950,7 @@ int kmeans(handle_t const& handle, * @return error flag */ template -int kmeans(handle_t const& handle, +int kmeans(raft::device_resources const& handle, index_type_t n, index_type_t d, index_type_t k, diff --git a/cpp/include/raft/cluster/detail/mst.cuh b/cpp/include/raft/cluster/detail/mst.cuh index 8143d21641..46e31b672e 100644 --- a/cpp/include/raft/cluster/detail/mst.cuh +++ b/cpp/include/raft/cluster/detail/mst.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -67,7 +67,7 @@ void merge_msts(sparse::solver::Graph_COO& coo1, */ template void connect_knn_graph( - const raft::handle_t& handle, + raft::device_resources const& handle, const value_t* X, sparse::solver::Graph_COO& msf, size_t m, @@ -130,7 +130,7 @@ void connect_knn_graph( */ template void build_sorted_mst( - const raft::handle_t& handle, + raft::device_resources const& handle, const value_t* X, const value_idx* indptr, const value_idx* indices, diff --git a/cpp/include/raft/cluster/detail/single_linkage.cuh b/cpp/include/raft/cluster/detail/single_linkage.cuh index d12db85e1b..473d858827 100644 --- a/cpp/include/raft/cluster/detail/single_linkage.cuh +++ b/cpp/include/raft/cluster/detail/single_linkage.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,7 +49,7 @@ static const size_t EMPTY = 0; * @param[in] n_clusters number of clusters to assign data samples */ template -void single_linkage(const raft::handle_t& handle, +void single_linkage(raft::device_resources const& handle, const value_t* X, size_t m, size_t n, diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index 4b912dc966..ac9e66d5da 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,12 +44,12 @@ using KeyValueIndexOp = detail::KeyValueIndexOp; * k-means++ algorithm. * * @code{.cpp} - * #include + * #include * #include * #include * using namespace raft::cluster; * ... - * raft::handle_t handle; + * raft::device_resources handle; * raft::cluster::KMeansParams params; * int n_features = 15, inertia, n_iter; * auto centroids = raft::make_device_matrix(handle, params.n_clusters, n_features); @@ -83,7 +83,7 @@ using KeyValueIndexOp = detail::KeyValueIndexOp; * @param[out] n_iter Number of iterations run. */ template -void fit(handle_t const& handle, +void fit(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, std::optional> sample_weight, @@ -98,12 +98,12 @@ void fit(handle_t const& handle, * @brief Predict the closest cluster each sample in X belongs to. * * @code{.cpp} - * #include + * #include * #include * #include * using namespace raft::cluster; * ... - * raft::handle_t handle; + * raft::device_resources handle; * raft::cluster::KMeansParams params; * int n_features = 15, inertia, n_iter; * auto centroids = raft::make_device_matrix(handle, params.n_clusters, n_features); @@ -147,7 +147,7 @@ void fit(handle_t const& handle, * their closest cluster center. */ template -void predict(handle_t const& handle, +void predict(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, std::optional> sample_weight, @@ -165,12 +165,12 @@ void predict(handle_t const& handle, * in the input. * * @code{.cpp} - * #include + * #include * #include * #include * using namespace raft::cluster; * ...
- * raft::handle_t handle; + * raft::device_resources handle; * raft::cluster::KMeansParams params; * int n_features = 15, inertia, n_iter; * auto centroids = raft::make_device_matrix(handle, params.n_clusters, n_features); @@ -210,7 +210,7 @@ void predict(handle_t const& handle, * @param[out] n_iter Number of iterations run. */ template -void fit_predict(handle_t const& handle, +void fit_predict(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, std::optional> sample_weight, @@ -239,7 +239,7 @@ void fit_predict(handle_t const& handle, * [dim = n_samples x n_features] */ template -void transform(const raft::handle_t& handle, +void transform(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_matrix_view centroids, @@ -249,7 +249,7 @@ void transform(const raft::handle_t& handle, +void transform(raft::device_resources const& handle, const KMeansParams& params, const DataT* X, const DataT* centroids, @@ -281,7 +281,7 @@ void transform(const raft::handle_t& handle, * */ template -void sample_centroids(const raft::handle_t& handle, +void sample_centroids(raft::device_resources const& handle, raft::device_matrix_view X, raft::device_vector_view minClusterDistance, raft::device_vector_view isSampleCentroid, @@ -308,7 +308,7 @@ void sample_centroids(const raft::handle_t& handle, * */ template -void cluster_cost(const raft::handle_t& handle, +void cluster_cost(raft::device_resources const& handle, raft::device_vector_view minClusterDistance, rmm::device_uvector& workspace, raft::device_scalar_view clusterCost, @@ -334,7 +334,7 @@ void cluster_cost(const raft::handle_t& handle, * @param[out] new_centroids: output matrix of updated centroids (size n_clusters, n_features) */ template -void update_centroids(const raft::handle_t& handle, +void update_centroids(raft::device_resources const& handle, raft::device_matrix_view X, raft::device_vector_view sample_weights, raft::device_matrix_view centroids, @@ -375,7 +375,7 @@ void update_centroids(const raft::handle_t& handle, * */ template -void min_cluster_distance(const raft::handle_t& handle, +void min_cluster_distance(raft::device_resources const& handle, raft::device_matrix_view X, raft::device_matrix_view centroids, raft::device_vector_view minClusterDistance, @@ -426,7 +426,7 @@ void min_cluster_distance(const raft::handle_t& handle, */ template void min_cluster_and_distance( - const raft::handle_t& handle, + raft::device_resources const& handle, raft::device_matrix_view X, raft::device_matrix_view centroids, raft::device_vector_view, IndexT> minClusterAndDistance, @@ -466,7 +466,7 @@ void min_cluster_and_distance( * */ template -void shuffle_and_gather(const raft::handle_t& handle, +void shuffle_and_gather(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out, uint32_t n_samples_to_gather, @@ -495,7 +495,7 @@ void shuffle_and_gather(const raft::handle_t& handle, * */ template -void count_samples_in_cluster(const raft::handle_t& handle, +void count_samples_in_cluster(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_vector_view L2NormX, @@ -525,7 +525,7 @@ void count_samples_in_cluster(const raft::handle_t& handle, * @param[in] workspace Temporary workspace buffer which can get resized */ template -void init_plus_plus(const raft::handle_t& handle, +void init_plus_plus(raft::device_resources
const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_matrix_view centroids, @@ -558,7 +558,7 @@ void init_plus_plus(const raft::handle_t& handle, * @param[in] workspace Temporary workspace buffer which can get resized */ template -void fit_main(const raft::handle_t& handle, +void fit_main(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_vector_view sample_weights, @@ -605,7 +605,7 @@ namespace raft::cluster { * @param[out] n_iter Number of iterations run. */ template -void kmeans_fit(handle_t const& handle, +void kmeans_fit(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, std::optional> sample_weight, @@ -617,7 +617,7 @@ void kmeans_fit(handle_t const& handle, } template -void kmeans_fit(handle_t const& handle, +void kmeans_fit(raft::device_resources const& handle, const KMeansParams& params, const DataT* X, const DataT* sample_weight, @@ -652,7 +652,7 @@ void kmeans_fit(handle_t const& handle, * their closest cluster center. */ template -void kmeans_predict(handle_t const& handle, +void kmeans_predict(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, std::optional> sample_weight, @@ -666,7 +666,7 @@ void kmeans_predict(handle_t const& handle, } template -void kmeans_predict(handle_t const& handle, +void kmeans_predict(raft::device_resources const& handle, const KMeansParams& params, const DataT* X, const DataT* sample_weight, @@ -717,7 +717,7 @@ void kmeans_predict(handle_t const& handle, * @param[out] n_iter Number of iterations run. */ template -void kmeans_fit_predict(handle_t const& handle, +void kmeans_fit_predict(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, std::optional> sample_weight, @@ -731,7 +731,7 @@ void kmeans_fit_predict(handle_t const& handle, } template -void kmeans_fit_predict(handle_t const& handle, +void kmeans_fit_predict(raft::device_resources const& handle, const KMeansParams& params, const DataT* X, const DataT* sample_weight, @@ -762,7 +762,7 @@ void kmeans_fit_predict(handle_t const& handle, * [dim = n_samples x n_features] */ template -void kmeans_transform(const raft::handle_t& handle, +void kmeans_transform(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_matrix_view centroids, @@ -772,7 +772,7 @@ void kmeans_transform(const raft::handle_t& handle, } template -void kmeans_transform(const raft::handle_t& handle, +void kmeans_transform(raft::device_resources const& handle, const KMeansParams& params, const DataT* X, const DataT* centroids, @@ -809,7 +809,7 @@ using KeyValueIndexOp = kmeans::KeyValueIndexOp; * */ template -void sampleCentroids(const raft::handle_t& handle, +void sampleCentroids(raft::device_resources const& handle, raft::device_matrix_view X, raft::device_vector_view minClusterDistance, raft::device_vector_view isSampleCentroid, @@ -836,7 +836,7 @@ void sampleCentroids(const raft::handle_t& handle, * */ template -void computeClusterCost(const raft::handle_t& handle, +void computeClusterCost(raft::device_resources const& handle, raft::device_vector_view minClusterDistance, rmm::device_uvector& workspace, raft::device_scalar_view clusterCost, @@ -867,7 +867,7 @@ void computeClusterCost(const raft::handle_t& handle, * */ template -void minClusterDistanceCompute(const raft::handle_t& handle, +void minClusterDistanceCompute(raft::device_resources const& handle, const 
KMeansParams& params, raft::device_matrix_view X, raft::device_matrix_view centroids, @@ -914,7 +914,7 @@ void minClusterDistanceCompute(const raft::handle_t& handle, */ template void minClusterAndDistanceCompute( - const raft::handle_t& handle, + raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_matrix_view centroids, @@ -952,7 +952,7 @@ void minClusterAndDistanceCompute( * */ template -void shuffleAndGather(const raft::handle_t& handle, +void shuffleAndGather(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out, uint32_t n_samples_to_gather, @@ -981,7 +981,7 @@ void shuffleAndGather(const raft::handle_t& handle, * */ template -void countSamplesInCluster(const raft::handle_t& handle, +void countSamplesInCluster(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_vector_view L2NormX, @@ -1012,7 +1012,7 @@ void countSamplesInCluster(const raft::handle_t& handle, * @param[in] workspace Temporary workspace buffer which can get resized */ template -void kmeansPlusPlus(const raft::handle_t& handle, +void kmeansPlusPlus(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_matrix_view centroidsRawData, @@ -1045,7 +1045,7 @@ void kmeansPlusPlus(const raft::handle_t& handle, * @param[in] workspace Temporary workspace buffer which can get resized */ template -void kmeans_fit_main(const raft::handle_t& handle, +void kmeans_fit_main(raft::device_resources const& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_vector_view weight, diff --git a/cpp/include/raft/cluster/kmeans_balanced.cuh b/cpp/include/raft/cluster/kmeans_balanced.cuh new file mode 100644 index 0000000000..405c7a8018 --- /dev/null +++ b/cpp/include/raft/cluster/kmeans_balanced.cuh @@ -0,0 +1,365 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace raft::cluster::kmeans_balanced { + +/** + * @brief Find clusters of balanced sizes with a hierarchical k-means algorithm. + * + * This variant of the k-means algorithm first clusters the dataset in mesoclusters, then clusters + * the subsets associated to each mesocluster into fine clusters, and finally runs a few k-means + * iterations over the whole dataset and with all the centroids to obtain the final clusters. + * + * Each k-means iteration applies expectation-maximization-balancing: + * - Balancing: adjust centers for clusters that have a small number of entries. If the size of a + * cluster is below a threshold, the center is moved towards a bigger cluster. 
+ * - Expectation: predict the labels (i.e. find closest cluster centroid to each point) + * - Maximization: calculate optimal centroids (i.e. find the center of gravity of each cluster) + * + * The number of mesoclusters is chosen by rounding the square root of the number of clusters. E.g. + * for 512 clusters, we would have 23 mesoclusters. The number of fine clusters per mesocluster is + * chosen proportionally to the number of points in each mesocluster. + * + * This variant of k-means uses random initialization and a fixed number of iterations, though + * iterations can be repeated if the balancing step moved the centroids. + * + * Additionally, this algorithm supports quantized datasets in arbitrary types but the core part of + * the algorithm will work with a floating-point type, hence a conversion function can be provided + * to map the data type to the math type. + * + * @code{.cpp} + * #include + * #include + * #include + * ... + * raft::handle_t handle; + * raft::cluster::kmeans_balanced_params params; + * auto centroids = raft::make_device_matrix(handle, n_clusters, n_features); + * raft::cluster::kmeans_balanced::fit(handle, params, X, centroids.view()); + * @endcode + * + * @tparam DataT Type of the input data. + * @tparam MathT Type of the centroids and mapped data. + * @tparam IndexT Type used for indexing. + * @tparam MappingOpT Type of the mapping function. + * @param[in] handle The raft resources + * @param[in] params Structure containing the hyper-parameters + * @param[in] X Training instances to cluster. The data must be in row-major format. + * [dim = n_samples x n_features] + * @param[out] centroids The generated centroids [dim = n_clusters x n_features] + * @param[in] mapping_op (optional) Functor to convert from the input datatype to the arithmetic + * datatype. If DataT == MathT, this must be the identity. + */ +template +void fit(const raft::device_resources& handle, + kmeans_balanced_params const& params, + raft::device_matrix_view X, + raft::device_matrix_view centroids, + MappingOpT mapping_op = raft::identity_op()) +{ + RAFT_EXPECTS(X.extent(1) == centroids.extent(1), + "Number of features in dataset and centroids are different"); + RAFT_EXPECTS(static_cast(X.extent(0)) * static_cast(X.extent(1)) <= + static_cast(std::numeric_limits::max()), + "The chosen index type cannot represent all indices for the given dataset"); + RAFT_EXPECTS(centroids.extent(0) > IndexT{0} && centroids.extent(0) <= X.extent(0), + "The number of centroids must be strictly positive and cannot exceed the number of " + "points in the training dataset."); + + detail::build_hierarchical(handle, + params, + X.extent(1), + X.data_handle(), + X.extent(0), + centroids.data_handle(), + centroids.extent(0), + mapping_op); +} + +/** + * @brief Predict the closest cluster each sample in X belongs to. + * + * @code{.cpp} + * #include + * #include + * #include + * ... + * raft::handle_t handle; + * raft::cluster::kmeans_balanced_params params; + * auto labels = raft::make_device_vector(handle, n_rows); + * raft::cluster::kmeans_balanced::predict(handle, params, X, centroids, labels); + * @endcode + * + * @tparam DataT Type of the input data. + * @tparam MathT Type of the centroids and mapped data. + * @tparam IndexT Type used for indexing. + * @tparam LabelT Type of the output labels. + * @tparam MappingOpT Type of the mapping function.
+ * @param[in] handle The raft resources + * @param[in] params Structure containing the hyper-parameters + * @param[in] X Dataset for which to infer the closest clusters. + * [dim = n_samples x n_features] + * @param[in] centroids The input centroids [dim = n_clusters x n_features] + * @param[out] labels The output labels [dim = n_samples] + * @param[in] mapping_op (optional) Functor to convert from the input datatype to the arithmetic + * datatype. If DataT == MathT, this must be the identity. + */ +template +void predict(const raft::device_resources& handle, + kmeans_balanced_params const& params, + raft::device_matrix_view X, + raft::device_matrix_view centroids, + raft::device_vector_view labels, + MappingOpT mapping_op = raft::identity_op()) +{ + RAFT_EXPECTS(X.extent(0) == labels.extent(0), + "Number of rows in dataset and labels are different"); + RAFT_EXPECTS(X.extent(1) == centroids.extent(1), + "Number of features in dataset and centroids are different"); + RAFT_EXPECTS(static_cast(X.extent(0)) * static_cast(X.extent(1)) <= + static_cast(std::numeric_limits::max()), + "The chosen index type cannot represent all indices for the given dataset"); + RAFT_EXPECTS(static_cast(centroids.extent(0)) <= + static_cast(std::numeric_limits::max()), + "The chosen label type cannot represent all cluster labels"); + + detail::predict(handle, + params, + centroids.data_handle(), + centroids.extent(0), + X.extent(1), + X.data_handle(), + X.extent(0), + labels.data_handle(), + mapping_op); +} + +/** + * @brief Compute hierarchical balanced k-means clustering and predict cluster index for each sample + * in the input. + * + * @code{.cpp} + * #include + * #include + * #include + * ... + * raft::handle_t handle; + * raft::cluster::kmeans_balanced_params params; + * auto centroids = raft::make_device_matrix(handle, n_clusters, n_features); + * auto labels = raft::make_device_vector(handle, n_rows); + * raft::cluster::kmeans_balanced::fit_predict( + * handle, params, X, centroids.view(), labels.view()); + * @endcode + * + * @tparam DataT Type of the input data. + * @tparam MathT Type of the centroids and mapped data. + * @tparam IndexT Type used for indexing. + * @tparam LabelT Type of the output labels. + * @tparam MappingOpT Type of the mapping function. + * @param[in] handle The raft resources + * @param[in] params Structure containing the hyper-parameters + * @param[in] X Training instances to cluster. The data must be in row-major format. + * [dim = n_samples x n_features] + * @param[out] centroids The output centroids [dim = n_clusters x n_features] + * @param[out] labels The output labels [dim = n_samples] + * @param[in] mapping_op (optional) Functor to convert from the input datatype to the arithmetic + * datatype. If DataT and MathT are the same, this must be the identity. 
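For illustration only (not part of this diff): a minimal sketch of driving the fit/predict pair above with a non-identity mapping op, as the documentation describes for quantized datasets. The functor name, sizes, and variable names are invented for the example; the call signatures are the ones shown in this file.

#include <raft/cluster/kmeans_balanced.cuh>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/device_resources.hpp>
#include <cstdint>

// Illustrative DataT -> MathT conversion functor (DataT = int8_t, MathT = float).
struct int8_to_float {
  __host__ __device__ float operator()(int8_t x) const { return static_cast<float>(x); }
};

void cluster_quantized(raft::device_resources const& handle,
                       const int8_t* d_data, int n_rows, int n_cols, int n_clusters)
{
  raft::cluster::kmeans_balanced_params params;  // defaults: L2Expanded metric, 20 iterations
  auto X         = raft::make_device_matrix_view<const int8_t, int>(d_data, n_rows, n_cols);
  auto centroids = raft::make_device_matrix<float, int>(handle, n_clusters, n_cols);
  auto labels    = raft::make_device_vector<int, int>(handle, n_rows);

  // DataT != MathT here, so the mapping op cannot be the identity.
  raft::cluster::kmeans_balanced::fit(handle, params, X, centroids.view(), int8_to_float{});
  auto centroids_const = raft::make_device_matrix_view<const float, int>(
    centroids.data_handle(), centroids.extent(0), centroids.extent(1));
  raft::cluster::kmeans_balanced::predict(
    handle, params, X, centroids_const, labels.view(), int8_to_float{});
}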
+ */ +template +void fit_predict(const raft::device_resources& handle, + kmeans_balanced_params const& params, + raft::device_matrix_view X, + raft::device_matrix_view centroids, + raft::device_vector_view labels, + MappingOpT mapping_op = raft::identity_op()) +{ + auto centroids_const = raft::make_device_matrix_view( + centroids.data_handle(), centroids.extent(0), centroids.extent(1)); + raft::cluster::kmeans_balanced::fit(handle, params, X, centroids, mapping_op); + raft::cluster::kmeans_balanced::predict(handle, params, X, centroids_const, labels, mapping_op); +} + +namespace helpers { + +/** + * @brief Randomly initialize centers and apply expectation-maximization-balancing iterations + * + * This is essentially the non-hierarchical balanced k-means algorithm which is used by the + * hierarchical algorithm once to build the mesoclusters and once per mesocluster to build the fine + * clusters. + * + * @code{.cpp} + * #include + * #include + * #include + * ... + * raft::handle_t handle; + * raft::cluster::kmeans_balanced_params params; + * auto centroids = raft::make_device_matrix(handle, n_clusters, n_features); + * auto labels = raft::make_device_vector(handle, n_samples); + * auto sizes = raft::make_device_vector(handle, n_clusters); + * raft::cluster::kmeans_balanced::build_clusters( + * handle, params, X, centroids.view(), labels.view(), sizes.view()); + * @endcode + * + * @tparam DataT Type of the input data. + * @tparam MathT Type of the centroids and mapped data. + * @tparam IndexT Type used for indexing. + * @tparam LabelT Type of the output labels. + * @tparam CounterT Counter type supported by CUDA's native atomicAdd. + * @tparam MappingOpT Type of the mapping function. + * @param[in] handle The raft resources + * @param[in] params Structure containing the hyper-parameters + * @param[in] X Training instances to cluster. The data must be in row-major format. + * [dim = n_samples x n_features] + * @param[out] centroids The output centroids [dim = n_clusters x n_features] + * @param[out] labels The output labels [dim = n_samples] + * @param[out] cluster_sizes Size of each cluster [dim = n_clusters] + * @param[in] mapping_op (optional) Functor to convert from the input datatype to the + * arithmetic datatype. If DataT == MathT, this must be the identity. + * @param[in] X_norm (optional) Dataset's row norms [dim = n_samples] + */ +template +void build_clusters(const raft::device_resources& handle, + const kmeans_balanced_params& params, + raft::device_matrix_view X, + raft::device_matrix_view centroids, + raft::device_vector_view labels, + raft::device_vector_view cluster_sizes, + MappingOpT mapping_op = raft::identity_op(), + std::optional> X_norm = std::nullopt) +{ + RAFT_EXPECTS(X.extent(0) == labels.extent(0), + "Number of rows in dataset and labels are different"); + RAFT_EXPECTS(X.extent(1) == centroids.extent(1), + "Number of features in dataset and centroids are different"); + RAFT_EXPECTS(centroids.extent(0) == cluster_sizes.extent(0), + "Number of rows in centroids and cluster_sizes are different"); + + detail::build_clusters(handle, + params, + X.extent(1), + X.data_handle(), + X.extent(0), + centroids.extent(0), + centroids.data_handle(), + labels.data_handle(), + cluster_sizes.data_handle(), + mapping_op, + handle.get_workspace_resource(), + X_norm.has_value() ? X_norm.value().data_handle() : nullptr); +} + +/** + * @brief Given the data and labels, calculate cluster centers and sizes in one sweep.
+ * + * Let `S_i = {x_k | x_k \in X & labels[k] == i}` be the vectors in the dataset with label i. + * + * On exit, + * `centers_i = (\sum_{x \in S_i} x + w_i * center_i) / (|S_i| + w_i)`, + * where `w_i = reset_counters ? 0 : cluster_size[i]`. + * + * In other words, the updated cluster centers are a weighted average of the existing cluster + * center, and the coordinates of the points labeled with i. _This allows calling this function + * multiple times with different datasets with the same effect as if calling this function once + * on the combined dataset_. + * + * @code{.cpp} + * #include + * #include + * ... + * raft::handle_t handle; + * auto centroids = raft::make_device_matrix(handle, n_clusters, n_features); + * auto sizes = raft::make_device_vector(handle, n_clusters); + * raft::cluster::kmeans_balanced::calc_centers_and_sizes( + * handle, X, labels, centroids.view(), sizes.view(), true); + * @endcode + * + * @tparam DataT Type of the input data. + * @tparam MathT Type of the centroids and mapped data. + * @tparam IndexT Type used for indexing. + * @tparam LabelT Type of the output labels. + * @tparam CounterT Counter type supported by CUDA's native atomicAdd. + * @tparam MappingOpT Type of the mapping function. + * @param[in] handle The raft resources + * @param[in] X Dataset for which to calculate cluster centers. The data must be in + * row-major format. [dim = n_samples x n_features] + * @param[in] labels The input labels [dim = n_samples] + * @param[out] centroids The output centroids [dim = n_clusters x n_features] + * @param[out] cluster_sizes Size of each cluster [dim = n_clusters] + * @param[in] reset_counters Whether to clear the output arrays before calculating. + * When set to `false`, this function may be used to update existing + * centers and sizes using the weighted average principle. + * @param[in] mapping_op (optional) Functor to convert from the input datatype to the + * arithmetic datatype. If DataT == MathT, this must be the identity. + */ +template +void calc_centers_and_sizes(const raft::device_resources& handle, + raft::device_matrix_view X, + raft::device_vector_view labels, + raft::device_matrix_view centroids, + raft::device_vector_view cluster_sizes, + bool reset_counters = true, + MappingOpT mapping_op = raft::identity_op()) +{ + RAFT_EXPECTS(X.extent(0) == labels.extent(0), + "Number of rows in dataset and labels are different"); + RAFT_EXPECTS(X.extent(1) == centroids.extent(1), + "Number of features in dataset and centroids are different"); + RAFT_EXPECTS(centroids.extent(0) == cluster_sizes.extent(0), + "Number of rows in centroids and cluster_sizes are different"); + + detail::calc_centers_and_sizes(handle, + centroids.data_handle(), + cluster_sizes.data_handle(), + centroids.extent(0), + X.extent(1), + X.data_handle(), + X.extent(0), + labels.data_handle(), + reset_counters, + mapping_op); +} + +} // namespace helpers + +} // namespace raft::cluster::kmeans_balanced diff --git a/cpp/include/raft/cluster/kmeans_balanced_types.hpp b/cpp/include/raft/cluster/kmeans_balanced_types.hpp new file mode 100644 index 0000000000..11b77e288a --- /dev/null +++ b/cpp/include/raft/cluster/kmeans_balanced_types.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
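A sketch (again not part of the diff; element types are illustrative) of the batch-update property documented for calc_centers_and_sizes above: two calls over two halves of a dataset, the second with reset_counters=false, leave the same centers and sizes as one call over the combined data.

#include <raft/cluster/kmeans_balanced.cuh>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/device_resources.hpp>

void update_in_two_batches(raft::device_resources const& handle,
                           raft::device_matrix_view<const float, int> batch0,
                           raft::device_matrix_view<const float, int> batch1,
                           raft::device_vector_view<const int, int> labels0,
                           raft::device_vector_view<const int, int> labels1,
                           raft::device_matrix_view<float, int> centroids,
                           raft::device_vector_view<int, int> sizes)
{
  namespace kb = raft::cluster::kmeans_balanced::helpers;
  // reset_counters=true: clear centroids/sizes, then accumulate the first batch.
  kb::calc_centers_and_sizes(handle, batch0, labels0, centroids, sizes, true);
  // reset_counters=false: fold the second batch into the running weighted average,
  // with w_i = cluster_size[i] playing the role described in the formula above.
  kb::calc_centers_and_sizes(handle, batch1, labels1, centroids, sizes, false);
}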
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft::cluster::kmeans_balanced { + +/** + * Simple object to specify hyper-parameters to the balanced k-means algorithm. + * + * The following metrics are currently supported in k-means balanced: + * - InnerProduct + * - L2Expanded + * - L2SqrtExpanded + */ +struct kmeans_balanced_params : kmeans_base_params { + /** + * Number of training iterations + */ + uint32_t n_iters = 20; +}; + +} // namespace raft::cluster::kmeans_balanced + +namespace raft::cluster { + +using kmeans_balanced::kmeans_balanced_params; + +} // namespace raft::cluster diff --git a/cpp/include/raft/cluster/kmeans_deprecated.cuh b/cpp/include/raft/cluster/kmeans_deprecated.cuh index a4cac4cb0f..8e0861ada1 100644 --- a/cpp/include/raft/cluster/kmeans_deprecated.cuh +++ b/cpp/include/raft/cluster/kmeans_deprecated.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,7 +46,7 @@ namespace cluster { * @return error flag */ template -int kmeans(handle_t const& handle, +int kmeans(raft::device_resources const& handle, index_type_t n, index_type_t d, index_type_t k, diff --git a/cpp/include/raft/cluster/kmeans_types.hpp b/cpp/include/raft/cluster/kmeans_types.hpp index b34f3320ad..4d956ad7a0 100644 --- a/cpp/include/raft/cluster/kmeans_types.hpp +++ b/cpp/include/raft/cluster/kmeans_types.hpp @@ -18,12 +18,24 @@ #include #include +namespace raft::cluster { + +/** Base structure for parameters that are common to all k-means algorithms */ +struct kmeans_base_params { + /** + * Metric to use for distance computation. The supported metrics can vary per algorithm. + */ + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded; +}; + +} // namespace raft::cluster + namespace raft::cluster::kmeans { /** * Simple object to specify hyper-parameters to the kmeans algorithm. */ -struct KMeansParams { +struct KMeansParams : kmeans_base_params { enum InitMethod { /** @@ -77,11 +89,6 @@ struct KMeansParams { */ raft::random::RngState rng_state{0}; - /** - * Metric to use for distance computation. - */ - raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded; - /** * Number of instance k-means algorithm will be run with different seeds. */ diff --git a/cpp/include/raft/cluster/single_linkage.cuh b/cpp/include/raft/cluster/single_linkage.cuh index 2d74c364b2..91241b853b 100644 --- a/cpp/include/raft/cluster/single_linkage.cuh +++ b/cpp/include/raft/cluster/single_linkage.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
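A small configuration sketch for the parameter structs introduced above (values are hypothetical): the distance metric lives in the shared kmeans_base_params base extracted in kmeans_types.hpp, while n_iters is specific to kmeans_balanced_params.

#include <raft/cluster/kmeans_balanced_types.hpp>
#include <raft/distance/distance_types.hpp>

raft::cluster::kmeans_balanced_params make_params()
{
  raft::cluster::kmeans_balanced_params params;
  params.metric  = raft::distance::DistanceType::L2SqrtExpanded;  // inherited from kmeans_base_params
  params.n_iters = 30;                                            // balanced-k-means-specific, default 20
  return params;
}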
@@ -50,7 +50,7 @@ namespace raft::cluster { template -void single_linkage(const raft::handle_t& handle, +void single_linkage(raft::device_resources const& handle, const value_t* X, size_t m, size_t n, @@ -87,7 +87,7 @@ constexpr int DEFAULT_CONST_C = 15; control of k. The algorithm will set `k = log(n) + c` */ template -void single_linkage(const raft::handle_t& handle, +void single_linkage(raft::device_resources const& handle, raft::device_matrix_view X, raft::device_matrix_view dendrogram, raft::device_vector_view labels, diff --git a/cpp/include/raft/comms/comms_test.hpp b/cpp/include/raft/comms/comms_test.hpp index c7e5dd3ab6..c61bb32f79 100644 --- a/cpp/include/raft/comms/comms_test.hpp +++ b/cpp/include/raft/comms/comms_test.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ #include #include -#include +#include namespace raft { namespace comms { @@ -31,7 +31,7 @@ namespace comms { * initialized comms instance. * @param[in] root the root rank id */ -bool test_collective_allreduce(const handle_t& handle, int root) +bool test_collective_allreduce(raft::device_resources const& handle, int root) { return detail::test_collective_allreduce(handle, root); } @@ -43,7 +43,7 @@ bool test_collective_allreduce(const handle_t& handle, int root) * initialized comms instance. * @param[in] root the root rank id */ -bool test_collective_broadcast(const handle_t& handle, int root) +bool test_collective_broadcast(raft::device_resources const& handle, int root) { return detail::test_collective_broadcast(handle, root); } @@ -55,7 +55,7 @@ bool test_collective_broadcast(const handle_t& handle, int root) * initialized comms instance. * @param[in] root the root rank id */ -bool test_collective_reduce(const handle_t& handle, int root) +bool test_collective_reduce(raft::device_resources const& handle, int root) { return detail::test_collective_reduce(handle, root); } @@ -67,7 +67,7 @@ bool test_collective_reduce(const handle_t& handle, int root) * initialized comms instance. * @param[in] root the root rank id */ -bool test_collective_allgather(const handle_t& handle, int root) +bool test_collective_allgather(raft::device_resources const& handle, int root) { return detail::test_collective_allgather(handle, root); } @@ -79,7 +79,7 @@ bool test_collective_allgather(const handle_t& handle, int root) * initialized comms instance. * @param[in] root the root rank id */ -bool test_collective_gather(const handle_t& handle, int root) +bool test_collective_gather(raft::device_resources const& handle, int root) { return detail::test_collective_gather(handle, root); } @@ -91,7 +91,7 @@ bool test_collective_gather(const handle_t& handle, int root) * initialized comms instance. * @param[in] root the root rank id */ -bool test_collective_gatherv(const handle_t& handle, int root) +bool test_collective_gatherv(raft::device_resources const& handle, int root) { return detail::test_collective_gatherv(handle, root); } @@ -103,7 +103,7 @@ bool test_collective_gatherv(const handle_t& handle, int root) * initialized comms instance. 
* @param[in] root the root rank id */ -bool test_collective_reducescatter(const handle_t& handle, int root) +bool test_collective_reducescatter(raft::device_resources const& handle, int root) { return detail::test_collective_reducescatter(handle, root); } @@ -115,7 +115,7 @@ bool test_collective_reducescatter(const handle_t& handle, int root) * initialized comms instance. * @param[in] numTrials number of iterations of all-to-all messaging to perform */ -bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials) +bool test_pointToPoint_simple_send_recv(raft::device_resources const& h, int numTrials) { return detail::test_pointToPoint_simple_send_recv(h, numTrials); } @@ -127,7 +127,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials) * initialized comms instance. * @param numTrials number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials) +bool test_pointToPoint_device_send_or_recv(raft::device_resources const& h, int numTrials) { return detail::test_pointToPoint_device_send_or_recv(h, numTrials); } @@ -139,7 +139,7 @@ bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials) * initialized comms instance. * @param numTrials number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials) +bool test_pointToPoint_device_sendrecv(raft::device_resources const& h, int numTrials) { return detail::test_pointToPoint_device_sendrecv(h, numTrials); } @@ -151,7 +151,7 @@ bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials) * initialized comms instance. * @param numTrials number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrials) +bool test_pointToPoint_device_multicast_sendrecv(raft::device_resources const& h, int numTrials) { return detail::test_pointToPoint_device_multicast_sendrecv(h, numTrials); } @@ -163,6 +163,9 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrial * initialized comms instance. * @param n_colors number of different colors to test */ -bool test_commsplit(const handle_t& h, int n_colors) { return detail::test_commsplit(h, n_colors); } +bool test_commsplit(raft::device_resources const& h, int n_colors) +{ + return detail::test_commsplit(h, n_colors); +} } // namespace comms }; // namespace raft diff --git a/cpp/include/raft/comms/detail/mpi_comms.hpp b/cpp/include/raft/comms/detail/mpi_comms.hpp index 508a9ce717..4062389eea 100644 --- a/cpp/include/raft/comms/detail/mpi_comms.hpp +++ b/cpp/include/raft/comms/detail/mpi_comms.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,8 +28,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp index 33892597d8..0db27f0a45 100644 --- a/cpp/include/raft/comms/detail/std_comms.hpp +++ b/cpp/include/raft/comms/detail/std_comms.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/comms/detail/test.hpp b/cpp/include/raft/comms/detail/test.hpp index 6ba4be3886..2b12bf2d2a 100644 --- a/cpp/include/raft/comms/detail/test.hpp +++ b/cpp/include/raft/comms/detail/test.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include #include @@ -38,7 +38,7 @@ namespace detail { * initialized comms instance. * @param[in] root the root rank id */ -bool test_collective_allreduce(const handle_t& handle, int root) +bool test_collective_allreduce(raft::device_resources const& handle, int root) { comms_t const& communicator = handle.get_comms(); @@ -69,7 +69,7 @@ bool test_collective_allreduce(const handle_t& handle, int root) * initialized comms instance. * @param[in] root the root rank id */ -bool test_collective_broadcast(const handle_t& handle, int root) +bool test_collective_broadcast(raft::device_resources const& handle, int root) { comms_t const& communicator = handle.get_comms(); @@ -104,7 +104,7 @@ bool test_collective_broadcast(const handle_t& handle, int root) * initialized comms instance. * @param[in] root the root rank id */ -bool test_collective_reduce(const handle_t& handle, int root) +bool test_collective_reduce(raft::device_resources const& handle, int root) { comms_t const& communicator = handle.get_comms(); @@ -140,7 +140,7 @@ bool test_collective_reduce(const handle_t& handle, int root) * initialized comms instance. * @param[in] root the root rank id */ -bool test_collective_allgather(const handle_t& handle, int root) +bool test_collective_allgather(raft::device_resources const& handle, int root) { comms_t const& communicator = handle.get_comms(); @@ -177,7 +177,7 @@ bool test_collective_allgather(const handle_t& handle, int root) * initialized comms instance. * @param[in] root the root rank id */ -bool test_collective_gather(const handle_t& handle, int root) +bool test_collective_gather(raft::device_resources const& handle, int root) { comms_t const& communicator = handle.get_comms(); @@ -214,7 +214,7 @@ bool test_collective_gather(const handle_t& handle, int root) * initialized comms instance. * @param[in] root the root rank id */ -bool test_collective_gatherv(const handle_t& handle, int root) +bool test_collective_gatherv(raft::device_resources const& handle, int root) { comms_t const& communicator = handle.get_comms(); @@ -273,7 +273,7 @@ bool test_collective_gatherv(const handle_t& handle, int root) * initialized comms instance. * @param[in] root the root rank id */ -bool test_collective_reducescatter(const handle_t& handle, int root) +bool test_collective_reducescatter(raft::device_resources const& handle, int root) { comms_t const& communicator = handle.get_comms(); @@ -308,7 +308,7 @@ bool test_collective_reducescatter(const handle_t& handle, int root) * initialized comms instance. 
* @param[in] numTrials number of iterations of all-to-all messaging to perform */ -bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials) +bool test_pointToPoint_simple_send_recv(raft::device_resources const& h, int numTrials) { comms_t const& communicator = h.get_comms(); int const rank = communicator.get_rank(); @@ -373,7 +373,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials) * initialized comms instance. * @param numTrials number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials) +bool test_pointToPoint_device_send_or_recv(raft::device_resources const& h, int numTrials) { comms_t const& communicator = h.get_comms(); int const rank = communicator.get_rank(); @@ -415,7 +415,7 @@ bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials) * initialized comms instance. * @param numTrials number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials) +bool test_pointToPoint_device_sendrecv(raft::device_resources const& h, int numTrials) { comms_t const& communicator = h.get_comms(); int const rank = communicator.get_rank(); @@ -461,7 +461,7 @@ bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials) * initialized comms instance. * @param numTrials number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrials) +bool test_pointToPoint_device_multicast_sendrecv(raft::device_resources const& h, int numTrials) { comms_t const& communicator = h.get_comms(); int const rank = communicator.get_rank(); @@ -520,7 +520,7 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrial * initialized comms instance. * @param n_colors number of different colors to test */ -bool test_commsplit(const handle_t& h, int n_colors) +bool test_commsplit(raft::device_resources const& h, int n_colors) { comms_t const& communicator = h.get_comms(); int const rank = communicator.get_rank(); diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp index b3ea62efd2..9076176ea6 100644 --- a/cpp/include/raft/comms/mpi_comms.hpp +++ b/cpp/include/raft/comms/mpi_comms.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,7 +40,7 @@ using mpi_comms = detail::mpi_comms; * #include * * MPI_Comm mpi_comm; - * raft::handle_t handle; + * raft::device_resources handle; * * initialize_mpi_comms(&handle, mpi_comm); * ... @@ -55,7 +55,7 @@ using mpi_comms = detail::mpi_comms; * comm.sync_stream(handle.get_stream()); * @endcode */ -inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm) +inline void initialize_mpi_comms(device_resources* handle, MPI_Comm comm) { auto communicator = std::make_shared( std::unique_ptr(new mpi_comms(comm, false, handle->get_stream()))); diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 5e619053da..6370d4a8e6 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION.
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include @@ -39,7 +39,7 @@ using std_comms = detail::std_comms; * Factory function to construct a RAFT NCCL communicator and inject it into a * RAFT handle. * - * @param handle raft::handle_t for injecting the comms + * @param handle raft::device_resources for injecting the comms * @param nccl_comm initialized NCCL communicator to use for collectives * @param num_ranks number of ranks in communicator clique * @param rank rank of local instance * @code{.cpp} * #include * * ncclComm_t nccl_comm; - * raft::handle_t handle; + * raft::device_resources handle; * * build_comms_nccl_only(&handle, nccl_comm, 5, 0); * ... * @endcode */ -void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks, int rank) +void build_comms_nccl_only(device_resources* handle, ncclComm_t nccl_comm, int num_ranks, int rank) { cudaStream_t stream = handle->get_stream(); @@ -77,7 +77,7 @@ void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks * Factory function to construct a RAFT NCCL+UCX and inject it into a RAFT * handle. * - * @param handle raft::handle_t for injecting the comms + * @param handle raft::device_resources for injecting the comms * @param nccl_comm initialized NCCL communicator to use for collectives * @param ucp_worker of local process * Note: This is purposefully left as void* so that the ucp_worker_h * @code{.cpp} * #include * * ncclComm_t nccl_comm; - * raft::handle_t handle; + * raft::device_resources handle; * ucp_worker_h ucp_worker; * ucp_ep_h *ucp_endpoints_arr; * @@ -110,8 +110,12 @@ void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks * comm.sync_stream(handle.get_stream()); * @endcode */ -void build_comms_nccl_ucx( - handle_t* handle, ncclComm_t nccl_comm, void* ucp_worker, void* eps, int num_ranks, int rank) +void build_comms_nccl_ucx(device_resources* handle, + ncclComm_t nccl_comm, + void* ucp_worker, + void* eps, + int num_ranks, + int rank) { auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); diff --git a/cpp/include/raft/core/comms.hpp b/cpp/include/raft/core/comms.hpp index 35ab6680de..463c17f2f6 100644 --- a/cpp/include/raft/core/comms.hpp +++ b/cpp/include/raft/core/comms.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include diff --git a/cpp/include/raft/core/detail/device_mdarray.hpp b/cpp/include/raft/core/detail/device_mdarray.hpp index ad6831794e..31dfaba70a 100644 --- a/cpp/include/raft/core/detail/device_mdarray.hpp +++ b/cpp/include/raft/core/detail/device_mdarray.hpp @@ -6,7 +6,7 @@ */ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@ */ #pragma once #include -#include +#include #include #include // dynamic_extent diff --git a/cpp/include/raft/core/device_mdarray.hpp b/cpp/include/raft/core/device_mdarray.hpp index 693e50a506..03cb09eecb 100644 --- a/cpp/include/raft/core/device_mdarray.hpp +++ b/cpp/include/raft/core/device_mdarray.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -72,7 +72,7 @@ using device_matrix = device_mdarray, Layo * @tparam ElementType the data type of the matrix elements * @tparam IndexType the index type of the extents * @tparam LayoutPolicy policy for strides and layout ordering - * @param handle raft::handle_t + * @param handle raft::device_resources * @param exts dimensionality of the array (series of integers) * @return raft::device_mdarray */ @@ -80,7 +80,7 @@ template -auto make_device_mdarray(const raft::handle_t& handle, extents exts) +auto make_device_mdarray(raft::device_resources const& handle, extents exts) { using mdarray_t = device_mdarray; @@ -95,7 +95,7 @@ auto make_device_mdarray(const raft::handle_t& handle, extents -auto make_device_mdarray(const raft::handle_t& handle, +auto make_device_mdarray(raft::device_resources const& handle, rmm::mr::device_memory_resource* mr, extents exts) { @@ -130,7 +130,7 @@ auto make_device_mdarray(const raft::handle_t& handle, template -auto make_device_matrix(raft::handle_t const& handle, IndexType n_rows, IndexType n_cols) +auto make_device_matrix(raft::device_resources const& handle, IndexType n_rows, IndexType n_cols) { return make_device_mdarray( handle.get_stream(), make_extents(n_rows, n_cols)); @@ -146,7 +146,7 @@ auto make_device_matrix(raft::handle_t const& handle, IndexType n_rows, IndexTyp * @return raft::device_scalar */ template -auto make_device_scalar(raft::handle_t const& handle, ElementType const& v) +auto make_device_scalar(raft::device_resources const& handle, ElementType const& v) { scalar_extent extents; using policy_t = typename device_scalar::container_policy_type; @@ -168,7 +168,7 @@ auto make_device_scalar(raft::handle_t const& handle, ElementType const& v) template -auto make_device_vector(raft::handle_t const& handle, IndexType n) +auto make_device_vector(raft::device_resources const& handle, IndexType n) { return make_device_mdarray(handle.get_stream(), make_extents(n)); diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp index f64f15d0d5..f72ae36d64 100644 --- a/cpp/include/raft/core/device_mdspan.hpp +++ b/cpp/include/raft/core/device_mdspan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
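The factory changes in device_mdarray.hpp above only swap the first parameter type from raft::handle_t to raft::device_resources; usage is otherwise unchanged. A brief sketch (sizes are arbitrary):

#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>

void make_containers()
{
  raft::device_resources handle;
  auto mat = raft::make_device_matrix<float, int>(handle, 128, 64);  // 128 x 64, row-major
  auto vec = raft::make_device_vector<int, int>(handle, 128);
  auto scl = raft::make_device_scalar<float>(handle, 1.0f);
  handle.sync_stream();  // allocations above are ordered on handle.get_stream()
}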
@@ -197,7 +197,9 @@ auto make_device_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexTy detail::alignment::value>::data_handle_type; static_assert(std::is_same>::value || std::is_same>::value); - assert(ptr == alignTo(ptr, detail::alignment::value)); + assert(reinterpret_cast(ptr) == + std::experimental::details::alignTo(reinterpret_cast(ptr), + detail::alignment::value)); data_handle_type aligned_pointer = ptr; diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp new file mode 100644 index 0000000000..68c56dc9b6 --- /dev/null +++ b/cpp/include/raft/core/device_resources.hpp @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __RAFT_DEVICE_RESOURCES +#define __RAFT_DEVICE_RESOURCES + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { + +/** + * @brief Main resource container object that stores all necessary resources + * used for calling necessary device functions, cuda kernels and/or libraries + */ +class device_resources : public resources { + public: + device_resources(const device_resources& handle, + rmm::mr::device_memory_resource* workspace_resource) + : resources{handle} + { + // replace the resource factory for the workspace_resources + resources::add_resource_factory( + std::make_shared(workspace_resource)); + } + + device_resources(const device_resources& handle) : resources{handle} {} + + device_resources(device_resources&&) = delete; + device_resources& operator=(device_resources&&) = delete; + + /** + * @brief Construct a resources instance with a stream view and stream pool + * + * @param[in] stream_view the default stream (which has the default per-thread stream if + * unspecified) + * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) + * @param[in] workspace_resource an optional resource used by some functions for allocating + * temporary workspaces. 
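A construction sketch for the three-argument device_resources constructor documented above (pool sizes are arbitrary; the RMM types used are standard):

#include <raft/core/device_resources.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/cuda_stream_pool.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>
#include <memory>

void build_resources()
{
  rmm::cuda_stream stream;  // owning, non-default CUDA stream
  auto stream_pool = std::make_shared<rmm::cuda_stream_pool>(4);  // 4 worker streams
  rmm::mr::cuda_memory_resource upstream;
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> workspace_mr{&upstream};

  raft::device_resources res(stream.view(), stream_pool, &workspace_mr);
  res.sync_stream_pool();  // synchronize all pooled streams
}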
+ */ + device_resources(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, + std::shared_ptr stream_pool = {nullptr}, + rmm::mr::device_memory_resource* workspace_resource = nullptr) + : resources{} + { + resources::add_resource_factory(std::make_shared()); + resources::add_resource_factory( + std::make_shared(stream_view)); + resources::add_resource_factory( + std::make_shared(stream_pool)); + resources::add_resource_factory( + std::make_shared(workspace_resource)); + } + + /** Destroys all held-up resources */ + virtual ~device_resources() {} + + int get_device() const { return resource::get_device_id(*this); } + + cublasHandle_t get_cublas_handle() const { return resource::get_cublas_handle(*this); } + + cusolverDnHandle_t get_cusolver_dn_handle() const + { + return resource::get_cusolver_dn_handle(*this); + } + + cusolverSpHandle_t get_cusolver_sp_handle() const + { + return resource::get_cusolver_sp_handle(*this); + } + + cusparseHandle_t get_cusparse_handle() const { return resource::get_cusparse_handle(*this); } + + rmm::exec_policy& get_thrust_policy() const { return resource::get_thrust_policy(*this); } + + /** + * @brief synchronize a stream on the current container + */ + void sync_stream(rmm::cuda_stream_view stream) const { resource::sync_stream(*this, stream); } + + /** + * @brief synchronize main stream on the current container + */ + void sync_stream() const { resource::sync_stream(*this); } + + /** + * @brief returns main stream on the current container + */ + rmm::cuda_stream_view get_stream() const { return resource::get_cuda_stream(*this); } + + /** + * @brief returns whether stream pool was initialized on the current container + */ + + bool is_stream_pool_initialized() const { return resource::is_stream_pool_initialized(*this); } + + /** + * @brief returns stream pool on the current container + */ + const rmm::cuda_stream_pool& get_stream_pool() const + { + return resource::get_cuda_stream_pool(*this); + } + + std::size_t get_stream_pool_size() const { return resource::get_stream_pool_size(*this); } + + /** + * @brief return stream from pool + */ + rmm::cuda_stream_view get_stream_from_stream_pool() const + { + return resource::get_stream_from_stream_pool(*this); + } + + /** + * @brief return stream from pool at index + */ + rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const + { + return resource::get_stream_from_stream_pool(*this, stream_idx); + } + + /** + * @brief return stream from pool if size > 0, else main stream on current container + */ + rmm::cuda_stream_view get_next_usable_stream() const + { + return resource::get_next_usable_stream(*this); + } + + /** + * @brief return stream from pool at index if size > 0, else main stream on current container + * + * @param[in] stream_idx the required index of the stream in the stream pool if available + */ + rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const + { + return resource::get_next_usable_stream(*this, stream_idx); + } + + /** + * @brief synchronize the stream pool on the current container + */ + void sync_stream_pool() const { return resource::sync_stream_pool(*this); } + + /** + * @brief synchronize subset of stream pool + * + * @param[in] stream_indices the indices of the streams in the stream pool to synchronize + */ + void sync_stream_pool(const std::vector stream_indices) const + { + return resource::sync_stream_pool(*this, stream_indices); + } + + /** + * @brief ask stream pool to wait on last event in main stream + */ + void 
wait_stream_pool_on_stream() const { return resource::wait_stream_pool_on_stream(*this); } + + void set_comms(std::shared_ptr communicator) + { + resource::set_comms(*this, communicator); + } + + const comms::comms_t& get_comms() const { return resource::get_comms(*this); } + + void set_subcomm(std::string key, std::shared_ptr subcomm) + { + resource::set_subcomm(*this, key, subcomm); + } + + const comms::comms_t& get_subcomm(std::string key) const + { + return resource::get_subcomm(*this, key); + } + + rmm::mr::device_memory_resource* get_workspace_resource() const + { + return resource::get_workspace_resource(*this); + } + + bool comms_initialized() const { return resource::comms_initialized(*this); } + + const cudaDeviceProp& get_device_properties() const + { + return resource::get_device_properties(*this); + } +}; // class device_resources + +/** + * @brief RAII approach to synchronizing across all streams in the current container + */ +class stream_syncer { + public: + explicit stream_syncer(const device_resources& handle) : handle_(handle) + { + handle_.sync_stream(); + } + ~stream_syncer() + { + handle_.wait_stream_pool_on_stream(); + handle_.sync_stream_pool(); + } + + stream_syncer(const stream_syncer& other) = delete; + stream_syncer& operator=(const stream_syncer& other) = delete; + + private: + const device_resources& handle_; +}; // class stream_syncer + +} // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/core/handle.hpp b/cpp/include/raft/core/handle.hpp index 08cb812bb7..02efebec9e 100644 --- a/cpp/include/raft/core/handle.hpp +++ b/cpp/include/raft/core/handle.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,326 +14,52 @@ * limitations under the License. */ -#ifndef __RAFT_RT_HANDLE -#define __RAFT_RT_HANDLE - #pragma once -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -///@todo: enable once we have migrated cuml-comms layer too -//#include - -#include - -#include -#include -#include -#include -#include -#include -#include +#include namespace raft { /** - * @brief Main handle object that stores all necessary context used for calling - * necessary cuda kernels and/or libraries + * raft::handle_t is being kept around for backwards + * compatibility and will be removed in a future version. + * + * Extending the `raft::handle_t` instead of `using` to + * minimize needed changes downstream + * (e.g. existing forward declarations, etc...) + * + * Use of `raft::resources` or `raft::device_resources` is preferred.
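Because handle_t now derives from device_resources, existing code keeps compiling while new code targets the new type. A compatibility sketch (function names here are invented for illustration):

#include <raft/core/handle.hpp>

void takes_new_type(raft::device_resources const& res) { res.sync_stream(); }

void legacy_caller()
{
  raft::handle_t handle;   // legacy type, kept for backwards compatibility
  takes_new_type(handle);  // a handle_t is-a device_resources, so this just works
}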
diff --git a/cpp/include/raft/core/handle.hpp b/cpp/include/raft/core/handle.hpp
index 08cb812bb7..02efebec9e 100644
--- a/cpp/include/raft/core/handle.hpp
+++ b/cpp/include/raft/core/handle.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,326 +14,52 @@
  * limitations under the License.
  */
 
-#ifndef __RAFT_RT_HANDLE
-#define __RAFT_RT_HANDLE
-
 #pragma once
 
-#include <memory>
-#include <mutex>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include <raft/core/comms.hpp>
-#include <raft/core/cublas_macros.hpp>
-#include <raft/core/cusolver_macros.hpp>
-#include <raft/core/cusparse_macros.hpp>
-#include <raft/core/interruptible.hpp>
-
-///@todo: enable once we have migrated cuml-comms layer too
-//#include <common/cuml_comms_int.hpp>
-
-#include <raft/core/error.hpp>
-
-#include <cublas_v2.h>
-#include <cusolverDn.h>
-#include <cusolverSp.h>
-#include <cusparse.h>
-#include <rmm/cuda_stream_pool.hpp>
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/exec_policy.hpp>
+#include <raft/core/device_resources.hpp>
 
 namespace raft {
 
 /**
- * @brief Main handle object that stores all necessary context used for calling
- *        necessary cuda kernels and/or libraries
+ * raft::handle_t is being kept around for backwards
+ * compatibility and will be removed in a future version.
+ *
+ * Extending `raft::device_resources` instead of `using` to
+ * minimize needed changes downstream
+ * (e.g. existing forward declarations, etc...)
+ *
+ * Use of `raft::resources` or `raft::device_resources` is preferred.
  */
-class handle_t {
+class handle_t : public raft::device_resources {
 public:
-  // delete copy/move constructors and assignment operators as
-  // copying and moving underlying resources is unsafe
-  handle_t(const handle_t&) = delete;
-  handle_t& operator=(const handle_t&) = delete;
-  handle_t(handle_t&&)                 = delete;
+  handle_t(const handle_t& handle, rmm::mr::device_memory_resource* workspace_resource)
+    : device_resources(handle, workspace_resource)
+  {
+  }
+
+  handle_t(const handle_t& handle) : device_resources{handle} {}
+
+  handle_t(handle_t&&) = delete;
  handle_t& operator=(handle_t&&) = delete;
 
  /**
-   * @brief Construct a handle with a stream view and stream pool
+   * @brief Construct a resources instance with a stream view and stream pool
   *
   * @param[in] stream_view the default stream (which has the default per-thread stream if
   *                        unspecified)
   * @param[in] stream_pool the stream pool used (which has default of nullptr if unspecified)
+   * @param[in] workspace_resource an optional resource used by some functions for allocating
+   *            temporary workspaces.
   */
-  handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread,
-           std::shared_ptr<rmm::cuda_stream_pool> stream_pool = {nullptr})
-    : dev_id_([]() -> int {
-        int cur_dev = -1;
-        RAFT_CUDA_TRY(cudaGetDevice(&cur_dev));
-        return cur_dev;
-      }()),
-      stream_view_{stream_view},
-      stream_pool_{stream_pool}
+  handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread,
+           std::shared_ptr<rmm::cuda_stream_pool> stream_pool = {nullptr},
+           rmm::mr::device_memory_resource* workspace_resource = nullptr)
+    : device_resources{stream_view, stream_pool, workspace_resource}
  {
-    create_resources();
  }
 
  /** Destroys all held-up resources */
-  virtual ~handle_t() { destroy_resources(); }
-
-  int get_device() const { return dev_id_; }
-
-  cublasHandle_t get_cublas_handle() const
-  {
-    std::lock_guard<std::mutex> _(mutex_);
-    if (!cublas_initialized_) {
-      RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_handle_));
-      RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_handle_, stream_view_));
-      cublas_initialized_ = true;
-    }
-    return cublas_handle_;
-  }
-
-  cusolverDnHandle_t get_cusolver_dn_handle() const
-  {
-    std::lock_guard<std::mutex> _(mutex_);
-    if (!cusolver_dn_initialized_) {
-      RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_dn_handle_));
-      RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_dn_handle_, stream_view_));
-      cusolver_dn_initialized_ = true;
-    }
-    return cusolver_dn_handle_;
-  }
-
-  cusolverSpHandle_t get_cusolver_sp_handle() const
-  {
-    std::lock_guard<std::mutex> _(mutex_);
-    if (!cusolver_sp_initialized_) {
-      RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_sp_handle_));
-      RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_sp_handle_, stream_view_));
-      cusolver_sp_initialized_ = true;
-    }
-    return cusolver_sp_handle_;
-  }
-
-  cusparseHandle_t get_cusparse_handle() const
-  {
-    std::lock_guard<std::mutex> _(mutex_);
-    if (!cusparse_initialized_) {
-      RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_handle_));
-      RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_handle_, stream_view_));
-      cusparse_initialized_ = true;
-    }
-    return cusparse_handle_;
-  }
-
-  rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; }
-
-  /**
-   * @brief synchronize a stream on the handle
-   */
-  void sync_stream(rmm::cuda_stream_view stream) const { interruptible::synchronize(stream); }
-
-  /**
-   * @brief synchronize main stream on the handle
-   */
-  void sync_stream() const { sync_stream(stream_view_); }
-
-  /**
-   * @brief returns main stream on the handle
-   */
-  rmm::cuda_stream_view get_stream() const { return stream_view_; }
-
-  /**
-   * @brief returns whether stream pool was initialized on the handle
-   */
-
-  bool is_stream_pool_initialized() const { return stream_pool_.get() != nullptr; }
-
-  /**
-   * @brief returns stream pool on the handle
-   */
-  const rmm::cuda_stream_pool& get_stream_pool() const
-  {
-    RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized");
-    return *stream_pool_;
-  }
-
-  std::size_t get_stream_pool_size() const
-  {
-    return is_stream_pool_initialized() ? stream_pool_->get_pool_size() : 0;
-  }
-
-  /**
-   * @brief return stream from pool
-   */
-  rmm::cuda_stream_view get_stream_from_stream_pool() const
-  {
-    RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized");
-    return stream_pool_->get_stream();
-  }
-
-  /**
-   * @brief return stream from pool at index
-   */
-  rmm::cuda_stream_view get_stream_from_stream_pool(std::size_t stream_idx) const
-  {
-    RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized");
-    return stream_pool_->get_stream(stream_idx);
-  }
-
-  /**
-   * @brief return stream from pool if size > 0, else main stream on handle
-   */
-  rmm::cuda_stream_view get_next_usable_stream() const
-  {
-    return is_stream_pool_initialized() ? get_stream_from_stream_pool() : stream_view_;
-  }
-
-  /**
-   * @brief return stream from pool at index if size > 0, else main stream on handle
-   *
-   * @param[in] stream_idx the required index of the stream in the stream pool if available
-   */
-  rmm::cuda_stream_view get_next_usable_stream(std::size_t stream_idx) const
-  {
-    return is_stream_pool_initialized() ? get_stream_from_stream_pool(stream_idx) : stream_view_;
-  }
-
-  /**
-   * @brief synchronize the stream pool on the handle
-   */
-  void sync_stream_pool() const
-  {
-    for (std::size_t i = 0; i < get_stream_pool_size(); i++) {
-      sync_stream(stream_pool_->get_stream(i));
-    }
-  }
-
-  /**
-   * @brief synchronize subset of stream pool
-   *
-   * @param[in] stream_indices the indices of the streams in the stream pool to synchronize
-   */
-  void sync_stream_pool(const std::vector<std::size_t> stream_indices) const
-  {
-    RAFT_EXPECTS(stream_pool_, "ERROR: rmm::cuda_stream_pool was not initialized");
-    for (const auto& stream_index : stream_indices) {
-      sync_stream(stream_pool_->get_stream(stream_index));
-    }
-  }
-
-  /**
-   * @brief ask stream pool to wait on last event in main stream
-   */
-  void wait_stream_pool_on_stream() const
-  {
-    RAFT_CUDA_TRY(cudaEventRecord(event_, stream_view_));
-    for (std::size_t i = 0; i < get_stream_pool_size(); i++) {
-      RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_pool_->get_stream(i), event_, 0));
-    }
-  }
-
-  void set_comms(std::shared_ptr<comms::comms_t> communicator) { communicator_ = communicator; }
-
-  const comms::comms_t& get_comms() const
-  {
-    RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n");
-    return *communicator_;
-  }
-
-  void set_subcomm(std::string key, std::shared_ptr<comms::comms_t> subcomm)
-  {
-    subcomms_[key] = subcomm;
-  }
-
-  const comms::comms_t& get_subcomm(std::string key) const
-  {
-    RAFT_EXPECTS(
-      subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str());
-
-    auto subcomm = subcomms_.at(key);
-
-    RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized");
-
-    return *subcomm;
-  }
-
-  bool comms_initialized() const { return (nullptr != communicator_.get()); }
-
-  const cudaDeviceProp& get_device_properties() const
-  {
-    std::lock_guard<std::mutex> _(mutex_);
-    if (!device_prop_initialized_) {
-      RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id_));
-      device_prop_initialized_ = true;
-    }
-    return prop_;
-  }
-
- private:
-  std::shared_ptr<comms::comms_t> communicator_;
-  std::unordered_map<std::string, std::shared_ptr<comms::comms_t>> subcomms_;
-
-  const int dev_id_;
-  mutable cublasHandle_t cublas_handle_;
-  mutable bool cublas_initialized_{false};
-  mutable cusolverDnHandle_t cusolver_dn_handle_;
-  mutable bool cusolver_dn_initialized_{false};
-  mutable cusolverSpHandle_t cusolver_sp_handle_;
-  mutable bool cusolver_sp_initialized_{false};
-  mutable cusparseHandle_t cusparse_handle_;
-  mutable bool cusparse_initialized_{false};
-  std::unique_ptr<rmm::exec_policy> thrust_policy_{nullptr};
-  rmm::cuda_stream_view stream_view_{rmm::cuda_stream_per_thread};
-  std::shared_ptr<rmm::cuda_stream_pool> stream_pool_{nullptr};
-  cudaEvent_t event_;
-  mutable cudaDeviceProp prop_;
-  mutable bool device_prop_initialized_{false};
-  mutable std::mutex mutex_;
-
-  void create_resources()
-  {
-    thrust_policy_ = std::make_unique<rmm::exec_policy>(stream_view_);
-
-    RAFT_CUDA_TRY(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
-  }
-
-  void destroy_resources()
-  {
-    if (cusparse_initialized_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_handle_)); }
-    if (cusolver_dn_initialized_) {
-      RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_));
-    }
-    if (cusolver_sp_initialized_) {
-      RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_));
-    }
-    if (cublas_initialized_) { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle_)); }
-    RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_));
-  }
-};  // class handle_t
-
-/**
- * @brief RAII approach to synchronizing across all streams in the handle
- */
-class stream_syncer {
- public:
-  explicit stream_syncer(const handle_t& handle) : handle_(handle) { handle_.sync_stream(); }
-  ~stream_syncer()
-  {
-    handle_.wait_stream_pool_on_stream();
-    handle_.sync_stream_pool();
-  }
-
-  stream_syncer(const stream_syncer& other) = delete;
-  stream_syncer& operator=(const stream_syncer& other) = delete;
-
- private:
-  const handle_t& handle_;
-};  // class stream_syncer
-
-}  // namespace raft
+  ~handle_t() override {}
+};
 
-#endif
\ No newline at end of file
+}  // end NAMESPACE raft
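A hypothetical downstream function showing why the shim keeps old code compiling: `handle_t` still satisfies every existing call site, but now simply inherits the behavior from `device_resources`.

```cpp
#include <raft/core/handle.hpp>

void legacy_entry_point(const raft::handle_t& handle)
{
  // Same accessors as before the refactor, now inherited from device_resources.
  rmm::cuda_stream_view stream = handle.get_stream();
  handle.sync_stream(stream);
}
```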
diff --git a/cpp/include/raft/core/host_mdspan.hpp b/cpp/include/raft/core/host_mdspan.hpp
index 1a0ea6432f..a6cdec7a84 100644
--- a/cpp/include/raft/core/host_mdspan.hpp
+++ b/cpp/include/raft/core/host_mdspan.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -144,7 +144,9 @@ auto make_host_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType
   static_assert(std::is_same<LayoutPolicy, layout_left_padded<ElementType>>::value ||
                 std::is_same<LayoutPolicy, layout_right_padded<ElementType>>::value);
 
-  assert(ptr == alignTo(ptr, detail::alignment::value));
+  assert(reinterpret_cast<std::uintptr_t>(ptr) ==
+         std::experimental::details::alignTo(reinterpret_cast<std::uintptr_t>(ptr),
+                                             detail::alignment::value));
 
   data_handle_type aligned_pointer = ptr;
   matrix_extent<IndexType> extents{n_rows, n_cols};
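A small sketch (names hypothetical) of the invariant the reworked assert enforces: the pointer handed to `make_host_aligned_matrix_view` must already sit on an aligned boundary, i.e. aligning its address must be a no-op.

```cpp
#include <cstdint>

// alignTo(addr, a) rounds addr up to the next multiple of a, so
// alignTo(addr, a) == addr exactly when addr is already a-aligned.
bool is_aligned(const void* ptr, std::uintptr_t alignment)
{
  auto addr = reinterpret_cast<std::uintptr_t>(ptr);
  return addr % alignment == 0;
}
```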
diff --git a/cpp/include/raft/core/kvp.hpp b/cpp/include/raft/core/kvp.hpp
index f6ea841dc4..8d3321eb77 100644
--- a/cpp/include/raft/core/kvp.hpp
+++ b/cpp/include/raft/core/kvp.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 
 #ifdef _RAFT_HAS_CUDA
 #include <cub/cub.cuh>
+#include <raft/util/cuda_utils.cuh>
 #endif
 
 namespace raft {
 /**
@@ -58,5 +59,27 @@ struct KeyValuePair {
   {
     return (value != b.value) || (key != b.key);
   }
+
+  RAFT_INLINE_FUNCTION bool operator<(const KeyValuePair<_Key, _Value>& b) const
+  {
+    return (key < b.key) || ((key == b.key) && value < b.value);
+  }
+
+  RAFT_INLINE_FUNCTION bool operator>(const KeyValuePair<_Key, _Value>& b) const
+  {
+    return (key > b.key) || ((key == b.key) && value > b.value);
+  }
 };
+
+#ifdef _RAFT_HAS_CUDA
+template <typename _Key, typename _Value>
+RAFT_INLINE_FUNCTION KeyValuePair<_Key, _Value> shfl_xor(const KeyValuePair<_Key, _Value>& input,
+                                                         int laneMask,
+                                                         int width     = WarpSize,
+                                                         uint32_t mask = 0xffffffffu)
+{
+  return KeyValuePair<_Key, _Value>(shfl_xor(input.key, laneMask, width, mask),
+                                    shfl_xor(input.value, laneMask, width, mask));
+}
+#endif
 }  // end namespace raft
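Sketch of what the new comparison operators provide (usable on host and device): ordering is lexicographic, key first, then value.

```cpp
#include <raft/core/kvp.hpp>

bool kvp_less_example()
{
  raft::KeyValuePair<int, float> a{1, 0.5f};
  raft::KeyValuePair<int, float> b{1, 2.0f};
  return a < b;  // true: keys tie, so 0.5f < 2.0f decides
}
```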
diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp
new file mode 100644
index 0000000000..c5f08b84b7
--- /dev/null
+++ b/cpp/include/raft/core/math.hpp
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+#include <type_traits>
+
+#include <raft/core/detail/macros.hpp>
+
+namespace raft {
+
+/**
+ * @defgroup Absolute Absolute value
+ * @{
+ */
+template <typename T>
+RAFT_INLINE_FUNCTION auto abs(T x)
+  -> std::enable_if_t<std::is_same_v<float, T> || std::is_same_v<double, T> ||
+                        std::is_same_v<int, T> || std::is_same_v<long int, T> ||
+                        std::is_same_v<long long int, T>,
+                      T>
+{
+#ifdef __CUDA_ARCH__
+  return ::abs(x);
+#else
+  return std::abs(x);
+#endif
+}
+template <typename T>
+constexpr RAFT_INLINE_FUNCTION auto abs(T x)
+  -> std::enable_if_t<!std::is_same_v<float, T> && !std::is_same_v<double, T> &&
+                        !std::is_same_v<int, T> && !std::is_same_v<long int, T> &&
+                        !std::is_same_v<long long int, T>,
+                      T>
+{
+  return x < T{0} ? -x : x;
+}
+/** @} */
+
+/**
+ * @defgroup Trigonometry Trigonometry functions
+ * @{
+ */
+/** Inverse cosine */
+template <typename T>
+RAFT_INLINE_FUNCTION auto acos(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::acos(x);
+#else
+  return std::acos(x);
+#endif
+}
+
+/** Inverse sine */
+template <typename T>
+RAFT_INLINE_FUNCTION auto asin(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::asin(x);
+#else
+  return std::asin(x);
+#endif
+}
+
+/** Inverse hyperbolic tangent */
+template <typename T>
+RAFT_INLINE_FUNCTION auto atanh(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::atanh(x);
+#else
+  return std::atanh(x);
+#endif
+}
+
+/** Cosine */
+template <typename T>
+RAFT_INLINE_FUNCTION auto cos(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::cos(x);
+#else
+  return std::cos(x);
+#endif
+}
+
+/** Sine */
+template <typename T>
+RAFT_INLINE_FUNCTION auto sin(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::sin(x);
+#else
+  return std::sin(x);
+#endif
+}
+
+/** Sine and cosine */
+template <typename T>
+RAFT_INLINE_FUNCTION std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double>> sincos(
+  const T& x, T* s, T* c)
+{
+#ifdef __CUDA_ARCH__
+  ::sincos(x, s, c);
+#else
+  *s = std::sin(x);
+  *c = std::cos(x);
+#endif
+}
+
+/** Hyperbolic tangent */
+template <typename T>
+RAFT_INLINE_FUNCTION auto tanh(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::tanh(x);
+#else
+  return std::tanh(x);
+#endif
+}
+/** @} */
+
+/**
+ * @defgroup Exponential Exponential and logarithm
+ * @{
+ */
+/** Exponential function */
+template <typename T>
+RAFT_INLINE_FUNCTION auto exp(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::exp(x);
+#else
+  return std::exp(x);
+#endif
+}
+
+/** Natural logarithm */
+template <typename T>
+RAFT_INLINE_FUNCTION auto log(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::log(x);
+#else
+  return std::log(x);
+#endif
+}
+/** @} */
+
+/**
+ * @defgroup Maximum Maximum of two or more values.
+ *
+ * The CUDA Math API has overloads for all combinations of float/double. We provide similar
+ * functionality while wrapping around std::max, which only supports arguments of the same type.
+ * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is
+ * very error-prone so we do not support that and require the user to cast instead. (e.g. the max of
+ * -1 and 1u is 4294967295u...)
+ *
+ * When no overload matches, we provide a generic implementation but require that both types be the
+ * same (and that the less-than operator be defined).
+ * @{
+ */
+template <typename T1, typename T2>
+RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y)
+{
+#ifdef __CUDA_ARCH__
+  // Combinations of types supported by the CUDA Math API
+  if constexpr ((std::is_integral_v<T1> && std::is_integral_v<T2> && std::is_same_v<T1, T2>) ||
+                ((std::is_same_v<T1, float> || std::is_same_v<T1, double>)&&(
+                  std::is_same_v<T2, float> || std::is_same_v<T2, double>))) {
+    return ::max(x, y);
+  }
+  // Else, check that the types are the same and provide a generic implementation
+  else {
+    static_assert(
+      std::is_same_v<T1, T2>,
+      "No native max overload for these types. Both argument types must be the same to use "
+      "the generic max. Please cast appropriately.");
+    return (x < y) ? y : x;
+  }
+#else
+  if constexpr (std::is_same_v<T1, float> && std::is_same_v<T2, double>) {
+    return std::max(static_cast<double>(x), y);
+  } else if constexpr (std::is_same_v<T1, double> && std::is_same_v<T2, float>) {
+    return std::max(x, static_cast<double>(y));
+  } else {
+    static_assert(
+      std::is_same_v<T1, T2>,
+      "std::max requires that both argument types be the same. Please cast appropriately.");
+    return std::max(x, y);
+  }
+#endif
+}
+
+/** Many-argument overload to avoid verbose nested calls or use with variadic arguments */
+template <typename T1, typename T2, typename... Args>
+RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y, Args&&... args)
+{
+  return raft::max(x, raft::max(y, std::forward<Args>(args)...));
+}
+
+/** One-argument overload for convenience when using with variadic arguments */
+template <typename T>
+constexpr RAFT_INLINE_FUNCTION auto max(const T& x)
+{
+  return x;
+}
+/** @} */
+
+/**
+ * @defgroup Minimum Minimum of two or more values.
+ *
+ * The CUDA Math API has overloads for all combinations of float/double. We provide similar
+ * functionality while wrapping around std::min, which only supports arguments of the same type.
+ * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is
+ * very error-prone so we do not support that and require the user to cast instead. (e.g. the min of
+ * -1 and 1u is 1u...)
+ *
+ * When no overload matches, we provide a generic implementation but require that both types be the
+ * same (and that the less-than operator be defined).
+ * @{
+ */
+template <typename T1, typename T2>
+RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y)
+{
+#ifdef __CUDA_ARCH__
+  // Combinations of types supported by the CUDA Math API
+  if constexpr ((std::is_integral_v<T1> && std::is_integral_v<T2> && std::is_same_v<T1, T2>) ||
+                ((std::is_same_v<T1, float> || std::is_same_v<T1, double>)&&(
+                  std::is_same_v<T2, float> || std::is_same_v<T2, double>))) {
+    return ::min(x, y);
+  }
+  // Else, check that the types are the same and provide a generic implementation
+  else {
+    static_assert(
+      std::is_same_v<T1, T2>,
+      "No native min overload for these types. Both argument types must be the same to use "
+      "the generic min. Please cast appropriately.");
+    return (y < x) ? y : x;
+  }
+#else
+  if constexpr (std::is_same_v<T1, float> && std::is_same_v<T2, double>) {
+    return std::min(static_cast<double>(x), y);
+  } else if constexpr (std::is_same_v<T1, double> && std::is_same_v<T2, float>) {
+    return std::min(x, static_cast<double>(y));
+  } else {
+    static_assert(
+      std::is_same_v<T1, T2>,
+      "std::min requires that both argument types be the same. Please cast appropriately.");
+    return std::min(x, y);
+  }
+#endif
+}
+
+/** Many-argument overload to avoid verbose nested calls or use with variadic arguments */
+template <typename T1, typename T2, typename... Args>
+RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y, Args&&... args)
+{
+  return raft::min(x, raft::min(y, std::forward<Args>(args)...));
+}
+
+/** One-argument overload for convenience when using with variadic arguments */
+template <typename T>
+constexpr RAFT_INLINE_FUNCTION auto min(const T& x)
+{
+  return x;
+}
+/** @} */
+
+/**
+ * @defgroup Power Power and root functions
+ * @{
+ */
+/** Power */
+template <typename T1, typename T2>
+RAFT_INLINE_FUNCTION auto pow(T1 x, T2 y)
+{
+#ifdef __CUDA_ARCH__
+  return ::pow(x, y);
+#else
+  return std::pow(x, y);
+#endif
+}
+
+/** Square root */
+template <typename T>
+RAFT_INLINE_FUNCTION auto sqrt(T x)
+{
+#ifdef __CUDA_ARCH__
+  return ::sqrt(x);
+#else
+  return std::sqrt(x);
+#endif
+}
+/** @} */
+
+/** Sign */
+template <typename T>
+RAFT_INLINE_FUNCTION auto sgn(T val) -> int
+{
+  return (T(0) < val) - (val < T(0));
+}
+
+}  // namespace raft
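A sketch of what the host/device dispatch buys: the same expression compiles in both CUDA and host passes, so functors and kernels can share code. `clamp01` is a hypothetical helper, not part of the diff.

```cpp
#include <raft/core/math.hpp>

template <typename T>
RAFT_INLINE_FUNCTION T clamp01(T x)
{
  // raft::max/min forward to ::max/::min under __CUDA_ARCH__ and to std::max/std::min on the host.
  return raft::min(raft::max(x, T{0}), T{1});
}
```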
diff --git a/cpp/include/raft/core/mdspan.hpp b/cpp/include/raft/core/mdspan.hpp
index 786ce69f89..f805d20064 100644
--- a/cpp/include/raft/core/mdspan.hpp
+++ b/cpp/include/raft/core/mdspan.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -304,4 +304,52 @@ RAFT_INLINE_FUNCTION auto unravel_index(Idx idx,
   }
 }
 
+/**
+ * @brief Const accessor specialization for default_accessor
+ *
+ * @tparam ElementType
+ * @param a
+ * @return std::experimental::default_accessor<std::add_const_t<ElementType>>
+ */
+template <typename ElementType>
+std::experimental::default_accessor<std::add_const_t<ElementType>> accessor_of_const(
+  std::experimental::default_accessor<ElementType> a)
+{
+  return {a};
+}
+
+/**
+ * @brief Const accessor specialization for host_device_accessor
+ *
+ * @tparam ElementType the data type of the mdspan elements
+ * @tparam MemType the type of memory where the elements are stored.
+ * @param a host_device_accessor
+ * @return host_device_accessor<std::experimental::default_accessor<std::add_const_t<ElementType>>,
+ * MemType>
+ */
+template <typename ElementType, memory_type MemType>
+host_device_accessor<std::experimental::default_accessor<std::add_const_t<ElementType>>, MemType>
+accessor_of_const(host_device_accessor<std::experimental::default_accessor<ElementType>, MemType> a)
+{
+  return {a};
+}
+
+/**
+ * @brief Create a copy of the given mdspan with const element type
+ *
+ * @tparam ElementType the const-qualified data type of the mdspan elements
+ * @tparam Extents raft::extents for dimensions
+ * @tparam Layout policy for strides and layout ordering
+ * @tparam Accessor Accessor policy for the input and output
+ * @param mds raft::mdspan object
+ * @return raft::mdspan
+ */
+template <typename ElementType, typename Extents, typename Layout, typename Accessor>
+auto make_const_mdspan(mdspan<ElementType, Extents, Layout, Accessor> mds)
+{
+  auto acc_c = accessor_of_const(mds.accessor());
+  return mdspan<std::add_const_t<ElementType>, Extents, Layout, decltype(acc_c)>{
+    mds.data_handle(), mds.mapping(), acc_c};
+}
+
 }  // namespace raft
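Hypothetical usage of `make_const_mdspan`: obtaining a read-only view over the same memory without copying data.

```cpp
#include <raft/core/host_mdspan.hpp>
#include <raft/core/mdspan.hpp>
#include <type_traits>

void const_view_example(raft::host_matrix_view<float, int> view)
{
  auto cview = raft::make_const_mdspan(view);  // element type becomes float const
  static_assert(std::is_const_v<std::remove_reference_t<decltype(cview(0, 0))>>);
}
```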
diff --git a/cpp/include/raft/core/operators.hpp b/cpp/include/raft/core/operators.hpp
index de521cc945..7acc907c49 100644
--- a/cpp/include/raft/core/operators.hpp
+++ b/cpp/include/raft/core/operators.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/kvp.hpp>
+#include <raft/core/math.hpp>
 
 namespace raft {
 
@@ -40,6 +41,14 @@ struct identity_op {
   }
 };
 
+struct void_op {
+  template <typename... UnusedArgs>
+  constexpr RAFT_INLINE_FUNCTION void operator()(UnusedArgs...) const
+  {
+    return;
+  }
+};
+
 template <typename OutT>
 struct cast_op {
   template <typename InT>
@@ -67,9 +76,9 @@ struct value_op {
 
 struct sqrt_op {
   template <typename Type, typename... UnusedArgs>
-  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in, UnusedArgs...) const
+  RAFT_INLINE_FUNCTION auto operator()(const Type& in, UnusedArgs...) const
   {
-    return std::sqrt(in);
+    return raft::sqrt(in);
   }
 };
 
@@ -83,9 +92,9 @@ struct nz_op {
 
 struct abs_op {
   template <typename Type, typename... UnusedArgs>
-  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in, UnusedArgs...) const
+  RAFT_INLINE_FUNCTION auto operator()(const Type& in, UnusedArgs...) const
   {
-    return std::abs(in);
+    return raft::abs(in);
   }
 };
 
@@ -130,37 +139,43 @@ struct div_op {
 };
 
 struct div_checkzero_op {
-  template <typename Type>
-  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
   {
-    if (b == Type{0}) { return Type{0}; }
+    if (b == T2{0}) { return T1{0} / T2{1}; }
     return a / b;
   }
 };
 
 struct pow_op {
   template <typename Type>
-  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const
+  RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const
+  {
+    return raft::pow(a, b);
+  }
+};
+
+struct mod_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
   {
-    return std::pow(a, b);
+    return a % b;
   }
 };
 
 struct min_op {
-  template <typename Type>
-  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const
+  template <typename... Args>
+  RAFT_INLINE_FUNCTION auto operator()(Args&&... args) const
   {
-    if (a > b) { return b; }
-    return a;
+    return raft::min(std::forward<Args>(args)...);
   }
 };
 
 struct max_op {
-  template <typename Type>
-  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const
+  template <typename... Args>
+  RAFT_INLINE_FUNCTION auto operator()(Args&&... args) const
   {
-    if (b > a) { return b; }
-    return a;
+    return raft::max(std::forward<Args>(args)...);
   }
 };
 
@@ -182,17 +197,49 @@ struct argmax_op {
   }
 };
 
+struct greater_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a > b;
+  }
+};
+
+struct less_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a < b;
+  }
+};
+
+struct greater_or_equal_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a >= b;
+  }
+};
+
+struct less_or_equal_op {
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
+  {
+    return a <= b;
+  }
+};
+
 struct equal_op {
-  template <typename Type>
-  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
   {
     return a == b;
   }
 };
 
 struct notequal_op {
-  template <typename Type>
-  constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& a, const Type& b) const
+  template <typename T1, typename T2>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
   {
     return a != b;
   }
@@ -263,6 +310,12 @@
 template <typename Type>
 using div_checkzero_const_op = plug_const_op<Type, div_checkzero_op>;
 
 template <typename Type>
 using pow_const_op = plug_const_op<Type, pow_op>;
 
+template <typename Type>
+using mod_const_op = plug_const_op<Type, mod_op>;
+
+template <typename Type>
+using equal_const_op = plug_const_op<Type, equal_op>;
+
 /**
  * @brief Constructs an operator by composing a chain of operators.
  *
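Sketch combining the new functors with the pre-existing `plug_const_op` machinery; the constant is bound as the second operand, and the values here are illustrative.

```cpp
#include <raft/core/operators.hpp>

bool op_example()
{
  raft::mod_const_op<int> mod3(3);  // computes x % 3
  raft::less_op lt;
  return lt(mod3(7), 2);  // (7 % 3) < 2 -> true
}
```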
diff --git a/cpp/include/raft/core/resource/comms.hpp b/cpp/include/raft/core/resource/comms.hpp
new file mode 100644
index 0000000000..73de166c14
--- /dev/null
+++ b/cpp/include/raft/core/resource/comms.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/core/comms.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+
+namespace raft::resource {
+class comms_resource : public resource {
+ public:
+  comms_resource(std::shared_ptr<comms::comms_t> communicator) : communicator_(communicator) {}
+
+  void* get_resource() override { return &communicator_; }
+
+  ~comms_resource() override {}
+
+ private:
+  std::shared_ptr<comms::comms_t> communicator_;
+};
+
+/**
+ * Factory that knows how to construct a
+ * specific raft::resource to populate
+ * the res_t.
+ */
+class comms_resource_factory : public resource_factory {
+ public:
+  comms_resource_factory(std::shared_ptr<comms::comms_t> communicator) : communicator_(communicator)
+  {
+  }
+
+  resource_type get_resource_type() override { return resource_type::COMMUNICATOR; }
+
+  resource* make_resource() override { return new comms_resource(communicator_); }
+
+ private:
+  std::shared_ptr<comms::comms_t> communicator_;
+};
+
+/**
+ * @defgroup resource_comms Comms resource functions
+ * @{
+ */
+
+inline bool comms_initialized(resources const& res)
+{
+  return res.has_resource_factory(resource_type::COMMUNICATOR);
+}
+
+inline comms::comms_t const& get_comms(resources const& res)
+{
+  RAFT_EXPECTS(comms_initialized(res), "ERROR: Communicator was not initialized\n");
+  return *(*res.get_resource<std::shared_ptr<comms::comms_t>>(resource_type::COMMUNICATOR));
+}
+
+inline void set_comms(resources const& res, std::shared_ptr<comms::comms_t> communicator)
+{
+  res.add_resource_factory(std::make_shared<comms_resource_factory>(communicator));
+}
+
+/**
+ * @}
+ */
+}  // namespace raft::resource
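A hedged sketch of the accessor pair above; how the communicator is built (NCCL, UCX, ...) is out of scope here, so it arrives as an already-constructed `shared_ptr`.

```cpp
#include <raft/core/device_resources.hpp>

void comms_example(raft::device_resources& res, std::shared_ptr<raft::comms::comms_t> comm)
{
  raft::resource::set_comms(res, comm);
  if (raft::resource::comms_initialized(res)) {
    (void)raft::resource::get_comms(res).get_size();  // number of ranks
  }
}
```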
diff --git a/cpp/include/raft/core/resource/cublas_handle.hpp b/cpp/include/raft/core/resource/cublas_handle.hpp
new file mode 100644
index 0000000000..710fcc7e60
--- /dev/null
+++ b/cpp/include/raft/core/resource/cublas_handle.hpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cublas_v2.h>
+#include <raft/core/cublas_macros.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+
+namespace raft::resource {
+
+class cublas_resource : public resource {
+ public:
+  cublas_resource(rmm::cuda_stream_view stream)
+  {
+    RAFT_CUBLAS_TRY_NO_THROW(cublasCreate(&cublas_res));
+    RAFT_CUBLAS_TRY_NO_THROW(cublasSetStream(cublas_res, stream));
+  }
+
+  ~cublas_resource() override { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_res)); }
+
+  void* get_resource() override { return &cublas_res; }
+
+ private:
+  cublasHandle_t cublas_res;
+};
+
+/**
+ * Factory that knows how to construct a
+ * specific raft::resource to populate
+ * the res_t.
+ */
+class cublas_resource_factory : public resource_factory {
+ public:
+  cublas_resource_factory(rmm::cuda_stream_view stream) : stream_(stream) {}
+  resource_type get_resource_type() override { return resource_type::CUBLAS_HANDLE; }
+  resource* make_resource() override { return new cublas_resource(stream_); }
+
+ private:
+  rmm::cuda_stream_view stream_;
+};
+
+/**
+ * @defgroup resource_cublas cuBLAS handle resource functions
+ * @{
+ */
+
+/**
+ * Load a cublasHandle_t from raft res if it exists, otherwise
+ * add it and return it.
+ * @param[in] res the raft resources object
+ * @return cublas handle
+ */
+inline cublasHandle_t get_cublas_handle(resources const& res)
+{
+  if (!res.has_resource_factory(resource_type::CUBLAS_HANDLE)) {
+    cudaStream_t stream = get_cuda_stream(res);
+    res.add_resource_factory(std::make_shared<cublas_resource_factory>(stream));
+  }
+  return *res.get_resource<cublasHandle_t>(resource_type::CUBLAS_HANDLE);
+};
+
+/**
+ * @}
+ */
+
+}  // namespace raft::resource
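Sketch of the lazy-creation contract shared by all the library-handle resources: the first call installs the factory and builds the handle on the main stream; later calls return the cached handle.

```cpp
#include <raft/core/device_resources.hpp>

void cublas_example(raft::device_resources const& res)
{
  cublasHandle_t h1 = raft::resource::get_cublas_handle(res);
  cublasHandle_t h2 = raft::resource::get_cublas_handle(res);
  // h1 == h2: the handle is constructed once per resources instance.
  (void)h1;
  (void)h2;
}
```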
diff --git a/cpp/include/raft/core/resource/cuda_event.hpp b/cpp/include/raft/core/resource/cuda_event.hpp
new file mode 100644
index 0000000000..4859d95ee9
--- /dev/null
+++ b/cpp/include/raft/core/resource/cuda_event.hpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuda_runtime.h>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft::resource {
+
+class cuda_event_resource : public resource {
+ public:
+  cuda_event_resource()
+  {
+    RAFT_CUDA_TRY_NO_THROW(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+  }
+  void* get_resource() override { return &event_; }
+
+  ~cuda_event_resource() override { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(event_)); }
+
+ private:
+  cudaEvent_t event_;
+};
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/cuda_stream.hpp b/cpp/include/raft/core/resource/cuda_stream.hpp
new file mode 100644
index 0000000000..318252199e
--- /dev/null
+++ b/cpp/include/raft/core/resource/cuda_stream.hpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <memory>
+#include <raft/core/interruptible.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <rmm/cuda_stream_view.hpp>
+
+namespace raft::resource {
+class cuda_stream_resource : public resource {
+ public:
+  cuda_stream_resource(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread)
+    : stream(stream_view)
+  {
+  }
+  void* get_resource() override { return &stream; }
+
+  ~cuda_stream_resource() override {}
+
+ private:
+  rmm::cuda_stream_view stream;
+};
+
+/**
+ * Factory that knows how to construct a specific raft::resource to populate
+ * the resources instance.
+ */
+class cuda_stream_resource_factory : public resource_factory {
+ public:
+  cuda_stream_resource_factory(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread)
+    : stream(stream_view)
+  {
+  }
+  resource_type get_resource_type() override { return resource_type::CUDA_STREAM_VIEW; }
+  resource* make_resource() override { return new cuda_stream_resource(stream); }
+
+ private:
+  rmm::cuda_stream_view stream;
+};
+
+/**
+ * @defgroup resource_cuda_stream CUDA stream resource functions
+ * @{
+ */
+/**
+ * Load a rmm::cuda_stream_view from a resources instance (and populate it on the res
+ * if needed).
+ * @param res raft res object for managing resources
+ * @return
+ */
+inline rmm::cuda_stream_view get_cuda_stream(resources const& res)
+{
+  if (!res.has_resource_factory(resource_type::CUDA_STREAM_VIEW)) {
+    res.add_resource_factory(std::make_shared<cuda_stream_resource_factory>());
+  }
+  return *res.get_resource<rmm::cuda_stream_view>(resource_type::CUDA_STREAM_VIEW);
+};
+
+/**
+ * Load a rmm::cuda_stream_view from a resources instance (and populate it on the res
+ * if needed).
+ * @param[in] res raft resources object for managing resources
+ * @param[in] stream_view cuda stream view
+ */
+inline void set_cuda_stream(resources const& res, rmm::cuda_stream_view stream_view)
+{
+  res.add_resource_factory(std::make_shared<cuda_stream_resource_factory>(stream_view));
+};
+
+/**
+ * @brief synchronize a specific stream
+ *
+ * @param[in] res the raft resources object
+ * @param[in] stream stream to synchronize
+ */
+inline void sync_stream(const resources& res, rmm::cuda_stream_view stream)
+{
+  interruptible::synchronize(stream);
+}
+
+/**
+ * @brief synchronize main stream on the resources instance
+ */
+inline void sync_stream(const resources& res) { sync_stream(res, get_cuda_stream(res)); }
+
+/**
+ * @}
+ */
+
+}  // namespace raft::resource
\ No newline at end of file
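Sketch of overriding the main stream on a `raft::resources` instance; the owning `rmm::cuda_stream` here is illustrative.

```cpp
#include <raft/core/device_resources.hpp>
#include <rmm/cuda_stream.hpp>

void stream_example(raft::resources& res)
{
  rmm::cuda_stream owned;  // RAII stream owned by the caller
  raft::resource::set_cuda_stream(res, owned.view());
  raft::resource::sync_stream(res);  // now synchronizes the stream set above
}
```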
diff --git a/cpp/include/raft/core/resource/cuda_stream_pool.hpp b/cpp/include/raft/core/resource/cuda_stream_pool.hpp
new file mode 100644
index 0000000000..dbce75b3a5
--- /dev/null
+++ b/cpp/include/raft/core/resource/cuda_stream_pool.hpp
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/detail/stream_sync_event.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+
+#include <rmm/cuda_stream_pool.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <vector>
+
+namespace raft::resource {
+
+class cuda_stream_pool_resource : public resource {
+ public:
+  cuda_stream_pool_resource(std::shared_ptr<rmm::cuda_stream_pool> stream_pool)
+    : stream_pool_(stream_pool)
+  {
+  }
+
+  ~cuda_stream_pool_resource() override {}
+  void* get_resource() override { return &stream_pool_; }
+
+ private:
+  std::shared_ptr<rmm::cuda_stream_pool> stream_pool_{nullptr};
+};
+
+/**
+ * Factory that knows how to construct a
+ * specific raft::resource to populate
+ * the res_t.
+ */
+class cuda_stream_pool_resource_factory : public resource_factory {
+ public:
+  cuda_stream_pool_resource_factory(std::shared_ptr<rmm::cuda_stream_pool> stream_pool = {nullptr})
+    : stream_pool_(stream_pool)
+  {
+  }
+
+  resource_type get_resource_type() override { return resource_type::CUDA_STREAM_POOL; }
+  resource* make_resource() override { return new cuda_stream_pool_resource(stream_pool_); }
+
+ private:
+  std::shared_ptr<rmm::cuda_stream_pool> stream_pool_{nullptr};
+};
+
+inline bool is_stream_pool_initialized(const resources& res)
+{
+  return *res.get_resource<std::shared_ptr<rmm::cuda_stream_pool>>(
+           resource_type::CUDA_STREAM_POOL) != nullptr;
+}
+
+/**
+ * @defgroup resource_stream_pool CUDA Stream pool resource functions
+ * @{
+ */
+
+/**
+ * Load a cuda_stream_pool, and create a new one if it doesn't already exist
+ * @param res raft res object for managing resources
+ * @return
+ */
+inline const rmm::cuda_stream_pool& get_cuda_stream_pool(const resources& res)
+{
+  if (!res.has_resource_factory(resource_type::CUDA_STREAM_POOL)) {
+    res.add_resource_factory(std::make_shared<cuda_stream_pool_resource_factory>());
+  }
+  return *(
+    *res.get_resource<std::shared_ptr<rmm::cuda_stream_pool>>(resource_type::CUDA_STREAM_POOL));
+};
+
+/**
+ * Explicitly set a stream pool on the current res. Note that this will overwrite
+ * an existing stream pool on the res.
+ * @param res
+ * @param stream_pool
+ */
+inline void set_cuda_stream_pool(const resources& res,
+                                 std::shared_ptr<rmm::cuda_stream_pool> stream_pool)
+{
+  res.add_resource_factory(std::make_shared<cuda_stream_pool_resource_factory>(stream_pool));
+};
+
+inline std::size_t get_stream_pool_size(const resources& res)
+{
+  return is_stream_pool_initialized(res) ? get_cuda_stream_pool(res).get_pool_size() : 0;
+}
+
+/**
+ * @brief return stream from pool
+ */
+inline rmm::cuda_stream_view get_stream_from_stream_pool(const resources& res)
+{
+  RAFT_EXPECTS(is_stream_pool_initialized(res), "ERROR: rmm::cuda_stream_pool was not initialized");
+  return get_cuda_stream_pool(res).get_stream();
+}
+
+/**
+ * @brief return stream from pool at index
+ */
+inline rmm::cuda_stream_view get_stream_from_stream_pool(const resources& res,
+                                                         std::size_t stream_idx)
+{
+  RAFT_EXPECTS(is_stream_pool_initialized(res), "ERROR: rmm::cuda_stream_pool was not initialized");
+  return get_cuda_stream_pool(res).get_stream(stream_idx);
+}
+
+/**
+ * @brief return stream from pool if size > 0, else main stream on res
+ */
+inline rmm::cuda_stream_view get_next_usable_stream(const resources& res)
+{
+  return is_stream_pool_initialized(res) ? get_stream_from_stream_pool(res) : get_cuda_stream(res);
+}
+
+/**
+ * @brief return stream from pool at index if size > 0, else main stream on res
+ *
+ * @param[in] res the raft resources object
+ * @param[in] stream_idx the required index of the stream in the stream pool if available
+ */
+inline rmm::cuda_stream_view get_next_usable_stream(const resources& res, std::size_t stream_idx)
+{
+  return is_stream_pool_initialized(res) ? get_stream_from_stream_pool(res, stream_idx)
+                                         : get_cuda_stream(res);
+}
+
+/**
+ * @brief synchronize the stream pool on the res
+ *
+ * @param[in] res the raft resources object
+ */
+inline void sync_stream_pool(const resources& res)
+{
+  for (std::size_t i = 0; i < get_stream_pool_size(res); i++) {
+    sync_stream(res, get_cuda_stream_pool(res).get_stream(i));
+  }
+}
+
+/**
+ * @brief synchronize subset of stream pool
+ *
+ * @param[in] res the raft resources object
+ * @param[in] stream_indices the indices of the streams in the stream pool to synchronize
+ */
+inline void sync_stream_pool(const resources& res, const std::vector<std::size_t> stream_indices)
+{
+  RAFT_EXPECTS(is_stream_pool_initialized(res), "ERROR: rmm::cuda_stream_pool was not initialized");
+  for (const auto& stream_index : stream_indices) {
+    sync_stream(res, get_cuda_stream_pool(res).get_stream(stream_index));
+  }
+}
+
+/**
+ * @brief ask stream pool to wait on last event in main stream
+ *
+ * @param[in] res the raft resources object
+ */
+inline void wait_stream_pool_on_stream(const resources& res)
+{
+  cudaEvent_t event = detail::get_cuda_stream_sync_event(res);
+  RAFT_CUDA_TRY(cudaEventRecord(event, get_cuda_stream(res)));
+  for (std::size_t i = 0; i < get_stream_pool_size(res); i++) {
+    RAFT_CUDA_TRY(cudaStreamWaitEvent(get_cuda_stream_pool(res).get_stream(i), event, 0));
+  }
+}
+
+/**
+ * @}
+ */
+
+}  // namespace raft::resource
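Sketch of the fork-join pattern these helpers exist for: pool streams first wait on the main stream's recorded event, independent work is scattered across them, then the pool is joined.

```cpp
#include <raft/core/device_resources.hpp>

void pool_example(raft::device_resources const& res)
{
  raft::resource::wait_stream_pool_on_stream(res);  // fork: pool waits on main stream
  for (std::size_t i = 0; i < raft::resource::get_stream_pool_size(res); ++i) {
    rmm::cuda_stream_view s = raft::resource::get_stream_from_stream_pool(res, i);
    (void)s;  // launch independent work on s here
  }
  raft::resource::sync_stream_pool(res);  // join
}
```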
diff --git a/cpp/include/raft/core/resource/cusolver_dn_handle.hpp b/cpp/include/raft/core/resource/cusolver_dn_handle.hpp
new file mode 100644
index 0000000000..7a33e2dd2a
--- /dev/null
+++ b/cpp/include/raft/core/resource/cusolver_dn_handle.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "cuda_stream.hpp"
+#include <cusolverDn.h>
+#include <raft/core/cusolver_macros.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+#include <rmm/cuda_stream_view.hpp>
+
+namespace raft::resource {
+
+/**
+ *
+ */
+class cusolver_dn_resource : public resource {
+ public:
+  cusolver_dn_resource(rmm::cuda_stream_view stream)
+  {
+    RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnCreate(&cusolver_res));
+    RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnSetStream(cusolver_res, stream));
+  }
+
+  void* get_resource() override { return &cusolver_res; }
+
+  ~cusolver_dn_resource() override { RAFT_CUSOLVER_TRY_NO_THROW(cusolverDnDestroy(cusolver_res)); }
+
+ private:
+  cusolverDnHandle_t cusolver_res;
+};
+
+/**
+ * @defgroup resource_cusolver_dn cuSolver DN handle resource functions
+ * @{
+ */
+
+/**
+ * Factory that knows how to construct a
+ * specific raft::resource to populate
+ * the res_t.
+ */ +class cusolver_dn_resource_factory : public resource_factory { + public: + cusolver_dn_resource_factory(rmm::cuda_stream_view stream) : stream_(stream) {} + resource_type get_resource_type() override { return resource_type::CUSOLVER_DN_HANDLE; } + resource* make_resource() override { return new cusolver_dn_resource(stream_); } + + private: + rmm::cuda_stream_view stream_; +}; + +/** + * Load a cusolverSpres_t from raft res if it exists, otherwise + * add it and return it. + * @param[in] res the raft resources object + * @return cusolver dn handle + */ +inline cusolverDnHandle_t get_cusolver_dn_handle(resources const& res) +{ + if (!res.has_resource_factory(resource_type::CUSOLVER_DN_HANDLE)) { + cudaStream_t stream = get_cuda_stream(res); + res.add_resource_factory(std::make_shared(stream)); + } + return *res.get_resource(resource_type::CUSOLVER_DN_HANDLE); +}; + +/** + * @} + */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/cusolver_sp_handle.hpp b/cpp/include/raft/core/resource/cusolver_sp_handle.hpp new file mode 100644 index 0000000000..61fd95b44f --- /dev/null +++ b/cpp/include/raft/core/resource/cusolver_sp_handle.hpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace raft::resource { + +/** + * + */ +class cusolver_sp_resource : public resource { + public: + cusolver_sp_resource(rmm::cuda_stream_view stream) + { + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpCreate(&cusolver_res)); + RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpSetStream(cusolver_res, stream)); + } + + void* get_resource() override { return &cusolver_res; } + + ~cusolver_sp_resource() override { RAFT_CUSOLVER_TRY_NO_THROW(cusolverSpDestroy(cusolver_res)); } + + private: + cusolverSpHandle_t cusolver_res; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class cusolver_sp_resource_factory : public resource_factory { + public: + cusolver_sp_resource_factory(rmm::cuda_stream_view stream) : stream_(stream) {} + resource_type get_resource_type() override { return resource_type::CUSOLVER_SP_HANDLE; } + resource* make_resource() override { return new cusolver_sp_resource(stream_); } + + private: + rmm::cuda_stream_view stream_; +}; + +/** + * @defgroup resource_cusolver_sp cuSolver SP handle resource functions + * @{ + */ + +/** + * Load a cusolverSpres_t from raft res if it exists, otherwise + * add it and return it. 
+ * @param[in] res the raft resources object + * @return cusolver sp handle + */ +inline cusolverSpHandle_t get_cusolver_sp_handle(resources const& res) +{ + if (!res.has_resource_factory(resource_type::CUSOLVER_SP_HANDLE)) { + cudaStream_t stream = get_cuda_stream(res); + res.add_resource_factory(std::make_shared(stream)); + } + return *res.get_resource(resource_type::CUSOLVER_SP_HANDLE); +}; + +/** + * @} + */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/cusparse_handle.hpp b/cpp/include/raft/core/resource/cusparse_handle.hpp new file mode 100644 index 0000000000..9893ed2f86 --- /dev/null +++ b/cpp/include/raft/core/resource/cusparse_handle.hpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace raft::resource { +class cusparse_resource : public resource { + public: + cusparse_resource(rmm::cuda_stream_view stream) + { + RAFT_CUSPARSE_TRY_NO_THROW(cusparseCreate(&cusparse_res)); + RAFT_CUSPARSE_TRY_NO_THROW(cusparseSetStream(cusparse_res, stream)); + } + + ~cusparse_resource() { RAFT_CUSPARSE_TRY_NO_THROW(cusparseDestroy(cusparse_res)); } + void* get_resource() override { return &cusparse_res; } + + private: + cusparseHandle_t cusparse_res; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class cusparse_resource_factory : public resource_factory { + public: + cusparse_resource_factory(rmm::cuda_stream_view stream) : stream_(stream) {} + resource_type get_resource_type() override { return resource_type::CUSPARSE_HANDLE; } + resource* make_resource() override { return new cusparse_resource(stream_); } + + private: + rmm::cuda_stream_view stream_; +}; + +/** + * @defgroup resource_cusparse cuSparse handle resource functions + * @{ + */ + +/** + * Load a cusparseres_t from raft res if it exists, otherwise + * add it and return it. + * @param[in] res the raft resources object + * @return cusparse handle + */ +inline cusparseHandle_t get_cusparse_handle(resources const& res) +{ + if (!res.has_resource_factory(resource_type::CUSPARSE_HANDLE)) { + rmm::cuda_stream_view stream = get_cuda_stream(res); + res.add_resource_factory(std::make_shared(stream)); + } + return *res.get_resource(resource_type::CUSPARSE_HANDLE); +}; + +/** + * @} + */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/detail/stream_sync_event.hpp b/cpp/include/raft/core/resource/detail/stream_sync_event.hpp new file mode 100644 index 0000000000..1d02fef20d --- /dev/null +++ b/cpp/include/raft/core/resource/detail/stream_sync_event.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace raft::resource::detail { + +/** + * Factory that knows how to construct a specific raft::resource to populate + * the res_t. + */ +class cuda_stream_sync_event_resource_factory : public resource_factory { + public: + resource_type get_resource_type() override { return resource_type::CUDA_STREAM_SYNC_EVENT; } + resource* make_resource() override { return new cuda_event_resource(); } +}; + +/** + * Load a cudaEvent from a resources instance (and populate it on the resources instance) + * if needed) for syncing the main cuda stream. + * @param res raft resources instance for managing resources + * @return + */ +inline cudaEvent_t& get_cuda_stream_sync_event(resources const& res) +{ + if (!res.has_resource_factory(resource_type::CUDA_STREAM_SYNC_EVENT)) { + res.add_resource_factory(std::make_shared()); + } + return *res.get_resource(resource_type::CUDA_STREAM_SYNC_EVENT); +}; + +} // namespace raft::resource::detail diff --git a/cpp/include/raft/core/resource/device_id.hpp b/cpp/include/raft/core/resource/device_id.hpp new file mode 100644 index 0000000000..b55e56ca45 --- /dev/null +++ b/cpp/include/raft/core/resource/device_id.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +namespace raft::resource { + +class device_id_resource : public resource { + public: + device_id_resource() + : dev_id_([]() -> int { + int cur_dev = -1; + RAFT_CUDA_TRY_NO_THROW(cudaGetDevice(&cur_dev)); + return cur_dev; + }()) + { + } + void* get_resource() override { return &dev_id_; } + + ~device_id_resource() override {} + + private: + int dev_id_; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class device_id_resource_factory : public resource_factory { + public: + resource_type get_resource_type() override { return resource_type::DEVICE_ID; } + resource* make_resource() override { return new device_id_resource(); } +}; + +/** + * @defgroup resource_device_id Device ID resource functions + * @{ + */ + +/** + * Load a device id from a res (and populate it on the res if needed). 
+ * @param res raft res object for managing resources + * @return device id + */ +inline int get_device_id(resources const& res) +{ + if (!res.has_resource_factory(resource_type::DEVICE_ID)) { + res.add_resource_factory(std::make_shared()); + } + return *res.get_resource(resource_type::DEVICE_ID); +}; + +/** + * @} + */ +} // namespace raft::resource \ No newline at end of file diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp new file mode 100644 index 0000000000..35ae3d715f --- /dev/null +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace raft::resource { +class device_memory_resource : public resource { + public: + device_memory_resource(rmm::mr::device_memory_resource* mr_ = nullptr) : mr(mr_) + { + if (mr_ == nullptr) { mr = rmm::mr::get_current_device_resource(); } + } + void* get_resource() override { return mr; } + + ~device_memory_resource() override {} + + private: + rmm::mr::device_memory_resource* mr; +}; + +/** + * Factory that knows how to construct a specific raft::resource to populate + * the resources instance. + */ +class workspace_resource_factory : public resource_factory { + public: + workspace_resource_factory(rmm::mr::device_memory_resource* mr_ = nullptr) : mr(mr_) {} + resource_type get_resource_type() override { return resource_type::WORKSPACE_RESOURCE; } + resource* make_resource() override { return new device_memory_resource(mr); } + + private: + rmm::mr::device_memory_resource* mr; +}; + +/** + * Load a temp workspace resource from a resources instance (and populate it on the res + * if needed). + * @param res raft resources object for managing resources + * @return device memory resource object + */ +inline rmm::mr::device_memory_resource* get_workspace_resource(resources const& res) +{ + if (!res.has_resource_factory(resource_type::WORKSPACE_RESOURCE)) { + res.add_resource_factory(std::make_shared()); + } + return res.get_resource(resource_type::WORKSPACE_RESOURCE); +}; + +/** + * Set a temp workspace resource on a resources instance. + * + * @param res raft resources object for managing resources + * @param mr a valid rmm device_memory_resource + */ +inline void set_workspace_resource(resources const& res, rmm::mr::device_memory_resource* mr) +{ + res.add_resource_factory(std::make_shared(mr)); +}; +} // namespace raft::resource \ No newline at end of file diff --git a/cpp/include/raft/core/resource/device_properties.hpp b/cpp/include/raft/core/resource/device_properties.hpp new file mode 100644 index 0000000000..c3b0b8f2b9 --- /dev/null +++ b/cpp/include/raft/core/resource/device_properties.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace raft::resource { + +class device_properties_resource : public resource { + public: + device_properties_resource(int dev_id) + { + RAFT_CUDA_TRY_NO_THROW(cudaGetDeviceProperties(&prop_, dev_id)); + } + void* get_resource() override { return &prop_; } + + ~device_properties_resource() override {} + + private: + cudaDeviceProp prop_; +}; + +/** + * @defgroup resource_device_props Device properties resource functions + * @{ + */ + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class device_properties_resource_factory : public resource_factory { + public: + device_properties_resource_factory(int dev_id) : dev_id_(dev_id) {} + resource_type get_resource_type() override { return resource_type::DEVICE_PROPERTIES; } + resource* make_resource() override { return new device_properties_resource(dev_id_); } + + private: + int dev_id_; +}; + +/** + * Load a cudaDeviceProp from a res (and populate it on the res if needed). + * @param res raft res object for managing resources + * @return populated cuda device properties instance + */ +inline cudaDeviceProp& get_device_properties(resources const& res) +{ + if (!res.has_resource_factory(resource_type::DEVICE_PROPERTIES)) { + int dev_id = get_device_id(res); + res.add_resource_factory(std::make_shared(dev_id)); + } + return *res.get_resource(resource_type::DEVICE_PROPERTIES); +}; + +/** + * @} + */ +} // namespace raft::resource \ No newline at end of file diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp new file mode 100644 index 0000000000..cf302e25f9 --- /dev/null +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace raft::resource { + +/** + * @defgroup resource_types Core resource vocabulary types + * @{ + */ + +/** + * @brief Resource types can apply to any resource and don't have to be host- or device-specific. 
+ */ +enum resource_type { + // device-specific resource types + CUBLAS_HANDLE = 0, // cublas handle + CUSOLVER_DN_HANDLE, // cusolver dn handle + CUSOLVER_SP_HANDLE, // cusolver sp handle + CUSPARSE_HANDLE, // cusparse handle + CUDA_STREAM_VIEW, // view of a cuda stream + CUDA_STREAM_POOL, // cuda stream pool + CUDA_STREAM_SYNC_EVENT, // cuda event for syncing streams + COMMUNICATOR, // raft communicator + SUB_COMMUNICATOR, // raft sub communicator + DEVICE_PROPERTIES, // cuda device properties + DEVICE_ID, // cuda device id + THRUST_POLICY, // thrust execution policy + WORKSPACE_RESOURCE, // rmm device memory resource + + LAST_KEY // reserved for the last key +}; + +/** + * @brief A resource constructs and contains an instance of + * some pre-determined object type and facades that object + * behind a common API. + */ +class resource { + public: + virtual void* get_resource() = 0; + + virtual ~resource() {} +}; + +class empty_resource : public resource { + public: + empty_resource() : resource() {} + + void* get_resource() override { return nullptr; } + + ~empty_resource() override {} +}; + +/** + * @brief A resource factory knows how to construct an instance of + * a specific raft::resource::resource. + */ +class resource_factory { + public: + /** + * @brief Return the resource_type associated with the current factory + * @return resource_type corresponding to the current factory + */ + virtual resource_type get_resource_type() = 0; + + /** + * @brief Construct an instance of the factory's underlying resource. + * @return resource instance + */ + virtual resource* make_resource() = 0; +}; + +/** + * @brief A placeholder factory that returns an empty resource. It is used to + * pre-populate the factory table before any real factories are registered. + */ +class empty_resource_factory : public resource_factory { + public: + empty_resource_factory() : resource_factory() {} + /** + * @brief Return the resource_type associated with the current factory + * @return resource_type corresponding to the current factory + */ + resource_type get_resource_type() override { return resource_type::LAST_KEY; } + + /** + * @brief Construct an instance of the factory's underlying resource. + * @return resource instance + */ + resource* make_resource() override { return &res; } + + private: + empty_resource res; +}; + +/** + * @} + */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/sub_comms.hpp b/cpp/include/raft/core/resource/sub_comms.hpp new file mode 100644 index 0000000000..7070b61c54 --- /dev/null +++ b/cpp/include/raft/core/resource/sub_comms.hpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
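To make the vocabulary concrete, here is a hedged sketch of a third-party resource/factory pair following the pattern above. The `call_counter` names are hypothetical, and reusing `LAST_KEY` is only a placeholder; a real integration would add its own enumerator ahead of `LAST_KEY`:

```cpp
#include <raft/core/resource/resource_types.hpp>

namespace example {

struct call_counter {
  int calls = 0;
};

// Owns one call_counter instance and exposes it through the common facade.
class counter_resource : public raft::resource::resource {
 public:
  void* get_resource() override { return &counter_; }
  ~counter_resource() override {}

 private:
  call_counter counter_;
};

// Constructs counter_resource instances on demand.
class counter_resource_factory : public raft::resource::resource_factory {
 public:
  raft::resource::resource_type get_resource_type() override
  {
    return raft::resource::resource_type::LAST_KEY;  // placeholder slot only
  }
  raft::resource::resource* make_resource() override { return new counter_resource(); }
};

}  // namespace example
```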
+ */ +#pragma once + +#include <raft/core/comms.hpp> +#include <raft/core/resource/resource_types.hpp> +#include <raft/core/resources.hpp> + +namespace raft::resource { +class sub_comms_resource : public resource { + public: + sub_comms_resource() : communicators_() {} + void* get_resource() override { return &communicators_; } + + ~sub_comms_resource() override {} + + private: + std::unordered_map<std::string, std::shared_ptr<comms::comms_t>> communicators_; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t. + */ +class sub_comms_resource_factory : public resource_factory { + public: + resource_type get_resource_type() override { return resource_type::SUB_COMMUNICATOR; } + resource* make_resource() override { return new sub_comms_resource(); } +}; + +/** + * @defgroup resource_subcomms Subcommunicator resource functions + * @{ + */ + +inline const comms::comms_t& get_subcomm(const resources& res, std::string key) +{ + if (!res.has_resource_factory(resource_type::SUB_COMMUNICATOR)) { + res.add_resource_factory(std::make_shared<sub_comms_resource_factory>()); + } + + auto sub_comms = + res.get_resource<std::unordered_map<std::string, std::shared_ptr<comms::comms_t>>>( + resource_type::SUB_COMMUNICATOR); + auto sub_comm = sub_comms->at(key); + RAFT_EXPECTS(nullptr != sub_comm.get(), "ERROR: Subcommunicator was not initialized"); + + return *sub_comm; +} + +inline void set_subcomm(resources const& res, + std::string key, + std::shared_ptr<comms::comms_t> subcomm) +{ + if (!res.has_resource_factory(resource_type::SUB_COMMUNICATOR)) { + res.add_resource_factory(std::make_shared<sub_comms_resource_factory>()); + } + auto sub_comms = + res.get_resource<std::unordered_map<std::string, std::shared_ptr<comms::comms_t>>>( + resource_type::SUB_COMMUNICATOR); + sub_comms->insert(std::make_pair(key, subcomm)); +} + +/** + * @} + */ + +} // namespace raft::resource \ No newline at end of file diff --git a/cpp/include/raft/core/resource/thrust_policy.hpp b/cpp/include/raft/core/resource/thrust_policy.hpp new file mode 100644 index 0000000000..1e7441e5e4 --- /dev/null +++ b/cpp/include/raft/core/resource/thrust_policy.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include <raft/core/resource/cuda_stream.hpp> +#include <raft/core/resource/resource_types.hpp> +#include <rmm/exec_policy.hpp> +namespace raft::resource { +class thrust_policy_resource : public resource { + public: + thrust_policy_resource(rmm::cuda_stream_view stream_view) + : thrust_policy_(std::make_unique<rmm::exec_policy>(stream_view)) + { + } + void* get_resource() override { return thrust_policy_.get(); } + + ~thrust_policy_resource() override {} + + private: + std::unique_ptr<rmm::exec_policy> thrust_policy_; +}; + +/** + * Factory that knows how to construct a + * specific raft::resource to populate + * the res_t.
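A hedged usage sketch for the subcommunicator accessors above; constructing the `comms_t` itself (e.g., via a NCCL-based initializer) is assumed to happen elsewhere and is not shown:

```cpp
#include <memory>

#include <raft/core/comms.hpp>
#include <raft/core/resource/sub_comms.hpp>
#include <raft/core/resources.hpp>

// Registers a hypothetical row-wise subcommunicator under the key "row" and
// immediately reads it back through the resource container.
void register_row_comm(raft::resources const& res,
                       std::shared_ptr<raft::comms::comms_t> row_comm)
{
  raft::resource::set_subcomm(res, "row", row_comm);

  // Lookup by key; at() throws if the key was never registered.
  raft::comms::comms_t const& comm = raft::resource::get_subcomm(res, "row");
  (void)comm;
}
```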
+ */ +class thrust_policy_resource_factory : public resource_factory { + public: + thrust_policy_resource_factory(rmm::cuda_stream_view stream_view) : stream_view_(stream_view) {} + resource_type get_resource_type() override { return resource_type::THRUST_POLICY; } + resource* make_resource() override { return new thrust_policy_resource(stream_view_); } + + private: + rmm::cuda_stream_view stream_view_; +}; + +/** + * @defgroup resource_thrust_policy Thrust policy resource functions + * @{ + */ + +/** + * Load a thrust policy from a res (and populate it on the res if needed). + * @param res raft res object for managing resources + * @return thrust execution policy + */ +inline rmm::exec_policy& get_thrust_policy(resources const& res) +{ + if (!res.has_resource_factory(resource_type::THRUST_POLICY)) { + rmm::cuda_stream_view stream = get_cuda_stream(res); + res.add_resource_factory(std::make_shared<thrust_policy_resource_factory>(stream)); + } + return *res.get_resource<rmm::exec_policy>(resource_type::THRUST_POLICY); +}; + +/** + * @} + */ + +} // namespace raft::resource \ No newline at end of file diff --git a/cpp/include/raft/core/resources.hpp b/cpp/include/raft/core/resources.hpp new file mode 100644 index 0000000000..64e281e934 --- /dev/null +++ b/cpp/include/raft/core/resources.hpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "resource/resource_types.hpp" +#include <algorithm> +#include <mutex> +#include <raft/core/error.hpp> +#include <string> +#include <vector> + +namespace raft { + +/** + * @brief Resource container which allows lazy-loading and registration + * of resource_factory implementations, which in turn generate resource instances. + * + * This class is intended to be agnostic of the resources it contains and + * does not, itself, differentiate between host and device resources. Downstream + * accessor functions can then register and load resources as needed in order + * to keep its usage somewhat opaque to end-users. + * + * @code{.cpp} + * #include <raft/core/resources.hpp> + * #include <raft/core/resource/cuda_stream.hpp> + * #include <raft/core/resource/cublas_handle.hpp> + * + * raft::resources res; + * auto stream = raft::resource::get_cuda_stream(res); + * auto cublas_handle = raft::resource::get_cublas_handle(res); + * @endcode + */ +class resources { + public: + template <typename T> + using pair_res = std::pair<resource::resource_type, std::shared_ptr<T>>; + + using pair_res_factory = pair_res<resource::resource_factory>; + using pair_resource = pair_res<resource::resource>; + + resources() + : factories_(resource::resource_type::LAST_KEY), resources_(resource::resource_type::LAST_KEY) + { + for (int i = 0; i < resource::resource_type::LAST_KEY; ++i) { + factories_.at(i) = std::make_pair(resource::resource_type::LAST_KEY, + std::make_shared<resource::empty_resource_factory>()); + resources_.at(i) = std::make_pair(resource::resource_type::LAST_KEY, + std::make_shared<resource::empty_resource>()); + } + } + + /** + * @brief Shallow copy of underlying resources instance. + * Note that this does not create any new resources.
+ */ + resources(const resources& res) : factories_(res.factories_), resources_(res.resources_) {} + resources(resources&&) = delete; + resources& operator=(resources&&) = delete; + + /** + * @brief Returns true if a resource_factory has been registered for the + * given resource_type, false otherwise. + * @param resource_type resource type to check + * @return true if resource_factory is registered for the given resource_type + */ + bool has_resource_factory(resource::resource_type resource_type) const + { + std::lock_guard<std::mutex> _(mutex_); + return factories_.at(resource_type).first != resource::resource_type::LAST_KEY; + } + + /** + * @brief Register a resource_factory with the current instance. + * This will overwrite any existing resource factories. + * @param factory resource factory to register on the current instance + */ + void add_resource_factory(std::shared_ptr<resource::resource_factory> factory) const + { + std::lock_guard<std::mutex> _(mutex_); + resource::resource_type rtype = factory.get()->get_resource_type(); + RAFT_EXPECTS(rtype != resource::resource_type::LAST_KEY, + "LAST_KEY is a placeholder and not a valid resource factory type."); + factories_.at(rtype) = std::make_pair(rtype, factory); + } + + /** + * @brief Retrieve a resource for the given resource_type and cast to given pointer type. + * Note that the resources are loaded lazily on-demand and resources which don't yet + * exist on the current instance will be created using the corresponding factory, if + * it exists. + * @tparam res_t pointer type for which retrieved resource will be casted + * @param resource_type resource type to retrieve + * @return the given resource, if it exists. + */ + template <typename res_t> + res_t* get_resource(resource::resource_type resource_type) const + { + std::lock_guard<std::mutex> _(mutex_); + + if (resources_.at(resource_type).first == resource::resource_type::LAST_KEY) { + RAFT_EXPECTS(factories_.at(resource_type).first != resource::resource_type::LAST_KEY, + "No resource factory has been registered for the given resource %d.", + resource_type); + resource::resource_factory* factory = factories_.at(resource_type).second.get(); + resources_.at(resource_type) = std::make_pair( + resource_type, std::shared_ptr<resource::resource>(factory->make_resource())); + } + + resource::resource* res = resources_.at(resource_type).second.get(); + return reinterpret_cast<res_t*>(res->get_resource()); + } + + protected: + mutable std::mutex mutex_; + mutable std::vector<pair_res_factory> factories_; + mutable std::vector<pair_resource> resources_; +}; +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/distance/detail/canberra.cuh b/cpp/include/raft/distance/detail/canberra.cuh index 90ed3940e1..f17a26dc4b 100644 --- a/cpp/include/raft/distance/detail/canberra.cuh +++ b/cpp/include/raft/distance/detail/canberra.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
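A small sketch of the lazy-loading and shallow-copy semantics just defined, assuming the `cuda_stream` accessor added elsewhere in this PR; the function is illustrative only:

```cpp
#include <cassert>

#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resources.hpp>

void copy_shares_resources()
{
  raft::resources res;

  // First access registers the default factory and materializes the stream.
  rmm::cuda_stream_view stream = raft::resource::get_cuda_stream(res);

  // The copy constructor copies the factory/resource tables (shared_ptrs), so
  // the copy hands back the same underlying stream rather than a new one.
  raft::resources copy = res;
  assert(raft::resource::get_cuda_stream(copy).value() == stream.value());
}
```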
@@ -73,19 +73,15 @@ static void canberraImpl(const DataT* x, // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::myAbs(x - y); - const auto add = raft::myAbs(x) + raft::myAbs(y); + const auto diff = raft::abs(x - y); + const auto add = raft::abs(x) + raft::abs(y); // deal with potential for 0 in denominator by // forcing 1/0 instead acc += ((add != 0) * diff / (add + (add == 0))); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = raft::void_op(); if (isRowMajor) { auto canberraRowMajor = pairwiseDistanceMatKernel #include #include namespace raft { @@ -72,16 +73,12 @@ static void chebyshevImpl(const DataT* x, // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::myAbs(x - y); - acc = raft::myMax(acc, diff); + const auto diff = raft::abs(x - y); + acc = raft::max(acc, diff); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = raft::void_op(); if (isRowMajor) { auto chebyshevRowMajor = pairwiseDistanceMatKernel +#include +#include +#include + +namespace raft::distance::detail { + +/** + * @brief Compress 2D boolean matrix to bitfield + * + * Utility kernel for maskedL2NN. + * + * @tparam T + * + * @param[in] in An `m x n` boolean matrix. Row major. + * @param[out] out An `(m / bits_per_elem) x n` matrix with elements of + * type T, where T is of size `bits_per_elem` bits. + * Note: the division (`/`) is a ceilDiv. + */ +template <typename T = uint64_t, typename = std::enable_if_t<std::is_integral<T>::value>> +__global__ void compress_to_bits_kernel( + raft::device_matrix_view<const bool, int, raft::layout_c_contiguous> in, + raft::device_matrix_view<T, int, raft::layout_c_contiguous> out) +{ + constexpr int bits_per_element = 8 * sizeof(T); + constexpr int tile_dim_m = bits_per_element; + constexpr int nthreads = 128; + constexpr int tile_dim_n = nthreads; // read 128 bools at once = 1 sector + + // Tile in shared memory is transposed + __shared__ bool smem[tile_dim_n][tile_dim_m]; + + const int num_tiles_per_m = raft::ceildiv(in.extent(0), tile_dim_m); + const int num_tiles_per_n = raft::ceildiv(in.extent(1), tile_dim_n); + + for (int lin_tile_idx = blockIdx.x; true; lin_tile_idx += gridDim.x) { + const int tile_idx_n = tile_dim_n * (lin_tile_idx % num_tiles_per_n); + const int tile_idx_m = tile_dim_m * (lin_tile_idx / num_tiles_per_n); + + if (in.extent(0) <= tile_idx_m) { break; } + // Fill shared memory tile + bool reg_buf[tile_dim_m]; +#pragma unroll + for (int i = 0; i < tile_dim_m; ++i) { + const int in_m = tile_idx_m + i; + const int in_n = tile_idx_n + threadIdx.x; + bool in_bounds = in_m < in.extent(0) && in_n < in.extent(1); + reg_buf[i] = in_bounds ? in(in_m, in_n) : false; + smem[threadIdx.x][i] = reg_buf[i]; + } + __syncthreads(); + + // Drain memory tile into single output element out_elem. + T out_elem{0}; +#pragma unroll + for (int j = 0; j < tile_dim_n; ++j) { + if (smem[threadIdx.x][j]) { out_elem |= T(1) << j; } + } + __syncthreads(); + + // Write output.
+ int out_m = tile_idx_m / bits_per_element; + int out_n = tile_idx_n + threadIdx.x; + + if (out_m < out.extent(0) && out_n < out.extent(1)) { out(out_m, out_n) = out_elem; } + } +} + +/** + * @brief Compress 2D boolean matrix to bitfield + * + * Utility kernel for maskedL2NN. + * + * @tparam T + * + * @param[in] in An `m x n` boolean matrix. Row major. + * @param[out] out An `(m / bits_per_elem) x n` matrix with elements of + * type T, where T is of size `bits_per_elem` bits. + * Note: the division (`/`) is a ceilDiv. + */ +template <typename T = uint64_t, typename = std::enable_if_t<std::is_integral<T>::value>> +void compress_to_bits(raft::device_resources const& handle, + raft::device_matrix_view<const bool, int, raft::layout_c_contiguous> in, + raft::device_matrix_view<T, int, raft::layout_c_contiguous> out) +{ + auto stream = handle.get_stream(); + constexpr int bits_per_element = 8 * sizeof(T); + + RAFT_EXPECTS(raft::ceildiv(in.extent(0), bits_per_element) == out.extent(0), + "Number of output rows must be ceildiv(input rows, bits_per_elem)"); + RAFT_EXPECTS(in.extent(1) == out.extent(1), "Number of output columns must equal input columns."); + + const int num_SMs = raft::getMultiProcessorCount(); + int blocks_per_sm = 0; + constexpr int num_threads = 128; + constexpr int dyn_smem_size = 0; + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, compress_to_bits_kernel<T>, num_threads, dyn_smem_size)); + + dim3 grid(num_SMs * blocks_per_sm); + dim3 block(128); + compress_to_bits_kernel<T><<<grid, block, 0, stream>>>(in, out); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +}; // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh index 9bdbbf112c..f7fe3678e6 100644 --- a/cpp/include/raft/distance/detail/correlation.cuh +++ b/cpp/include/raft/distance/detail/correlation.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -125,7 +125,7 @@ static void correlationImpl(const DataT* x, auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]); auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]); - acc[i][j] = 1 - (numer / raft::mySqrt(Q_denom * R_denom)); + acc[i][j] = 1 - (numer / raft::sqrt(Q_denom * R_denom)); } } }; diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh index 4184810fff..1a2db63f5c 100644 --- a/cpp/include/raft/distance/detail/euclidean.cuh +++ b/cpp/include/raft/distance/detail/euclidean.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ struct L2ExpandedOp { __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept { AccT outVal = aNorm + bNorm - DataT(2.0) * accVal; - return sqrt ? raft::mySqrt(outVal) : outVal; + return sqrt ?
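For orientation, a hedged sketch of driving `compress_to_bits` from host code; the buffer setup and function name are illustrative, not part of this diff:

```cpp
#include <cstdint>

#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/distance/detail/compress_to_bits.cuh>
#include <raft/util/cuda_utils.cuh>

void pack_adjacency(raft::device_resources const& handle, int m, int n)
{
  // Boolean m x n adjacency matrix and its packed ceil(m/64) x n counterpart.
  auto adj    = raft::make_device_matrix<bool, int>(handle, m, n);
  auto packed = raft::make_device_matrix<uint64_t, int>(handle, raft::ceildiv(m, 64), n);

  // compress_to_bits expects a const view of the boolean input.
  auto adj_const = raft::make_device_matrix_view<const bool, int>(adj.data_handle(), m, n);
  raft::distance::detail::compress_to_bits(handle, adj_const, packed.view());
}
```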
raft::sqrt(outVal) : outVal; } __device__ AccT operator()(DataT aData) const noexcept { return aData; } @@ -130,7 +130,7 @@ void euclideanExpImpl(const DataT* x, for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::mySqrt(acc[i][j]); + acc[i][j] = raft::sqrt(acc[i][j]); } } } @@ -350,7 +350,7 @@ void euclideanUnExpImpl(const DataT* x, for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::mySqrt(acc[i][j]); + acc[i][j] = raft::sqrt(acc[i][j]); } } } diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index 5311a26d19..4f5e224a19 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -39,6 +39,7 @@ template <typename LabelT, typename DataT> struct KVPMinReduceImpl { typedef raft::KeyValuePair<LabelT, DataT> KVP; DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } + DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } }; // KVPMinReduce @@ -185,7 +186,7 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { auto acc_ij = acc[i][j]; - acc[i][j] = acc_ij > DataT{0} ? raft::mySqrt(acc_ij) : DataT{0}; + acc[i][j] = acc_ij > DataT{0} ? raft::sqrt(acc_ij) : DataT{0}; } } } diff --git a/cpp/include/raft/distance/detail/hellinger.cuh b/cpp/include/raft/distance/detail/hellinger.cuh index 51f462ab36..13507fe84f 100644 --- a/cpp/include/raft/distance/detail/hellinger.cuh +++ b/cpp/include/raft/distance/detail/hellinger.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -105,7 +105,7 @@ static void hellingerImpl(const DataT* x, // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative const auto finalVal = (1 - acc[i][j]); const auto rectifier = (!signbit(finalVal)); - acc[i][j] = raft::mySqrt(rectifier * finalVal); + acc[i][j] = raft::sqrt(rectifier * finalVal); } } }; diff --git a/cpp/include/raft/distance/detail/jensen_shannon.cuh b/cpp/include/raft/distance/detail/jensen_shannon.cuh index 92ee071cf5..f96da01b87 100644 --- a/cpp/include/raft/distance/detail/jensen_shannon.cuh +++ b/cpp/include/raft/distance/detail/jensen_shannon.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -78,11 +78,11 @@ static void jensenShannonImpl(const DataT* x, auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { const DataT m = 0.5f * (x + y); const bool m_zero = (m == 0); - const auto logM = (!m_zero) * raft::myLog(m + m_zero); + const auto logM = (!m_zero) * raft::log(m + m_zero); const bool x_zero = (x == 0); const bool y_zero = (y == 0); - acc += (-x * (logM - raft::myLog(x + x_zero))) + (-y * (logM - raft::myLog(y + y_zero))); + acc += (-x * (logM - raft::log(x + x_zero))) + (-y * (logM - raft::log(y + y_zero))); }; // epilogue operation lambda for final value calculation @@ -95,7 +95,7 @@ static void jensenShannonImpl(const DataT* x, for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::mySqrt(0.5 * acc[i][j]); + acc[i][j] = raft::sqrt(0.5 * acc[i][j]); } } }; diff --git a/cpp/include/raft/distance/detail/kl_divergence.cuh b/cpp/include/raft/distance/detail/kl_divergence.cuh index 4c0c4b6ace..7ebeaf4de9 100644 --- a/cpp/include/raft/distance/detail/kl_divergence.cuh +++ b/cpp/include/raft/distance/detail/kl_divergence.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -81,10 +81,10 @@ static void klDivergenceImpl(const DataT* x, auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { if (isRowMajor) { const bool x_zero = (x == 0); - acc += x * (raft::myLog(x + x_zero) - y); + acc += x * (raft::log(x + x_zero) - y); } else { const bool y_zero = (y == 0); - acc += y * (raft::myLog(y + y_zero) - x); + acc += y * (raft::log(y + y_zero) - x); } }; @@ -92,23 +92,23 @@ static void klDivergenceImpl(const DataT* x, if (isRowMajor) { const bool x_zero = (x == 0); const bool y_zero = (y == 0); - acc += x * (raft::myLog(x + x_zero) - (!y_zero) * raft::myLog(y + y_zero)); + acc += x * (raft::log(x + x_zero) - (!y_zero) * raft::log(y + y_zero)); } else { const bool y_zero = (y == 0); const bool x_zero = (x == 0); - acc += y * (raft::myLog(y + y_zero) - (!x_zero) * raft::myLog(x + x_zero)); + acc += y * (raft::log(y + y_zero) - (!x_zero) * raft::log(x + x_zero)); } }; auto unaryOp_lambda = [] __device__(DataT input) { const bool x_zero = (input == 0); - return (!x_zero) * raft::myLog(input + x_zero); + return (!x_zero) * raft::log(input + x_zero); }; auto unaryOp_lambda_reverse = [] __device__(DataT input) { // reverse previous log (x) back to x using (e ^ log(x)) const bool x_zero = (input == 0); - return (!x_zero) * raft::myExp(input); + return (!x_zero) * raft::exp(input); }; // epilogue operation lambda for final value calculation diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh index 95514db60b..bf10651b60 100644 --- a/cpp/include/raft/distance/detail/l1.cuh +++ b/cpp/include/raft/distance/detail/l1.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
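The Jensen-Shannon and KL-divergence lambdas above share one branch-free zero guard. Restated as a scalar helper (plain C++, not part of the diff) to make the trick explicit:

```cpp
#include <cmath>

// Branch-free guard: when x == 0, `x + x_zero` evaluates to 1 so log() stays
// finite, and the (!x_zero) factor forces the whole term to exactly zero.
inline float guarded_log(float x)
{
  const bool x_zero = (x == 0.0f);
  return (!x_zero) * std::log(x + x_zero);
}
```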
@@ -71,16 +71,12 @@ static void l1Impl(const DataT* x, // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::myAbs(x - y); + const auto diff = raft::abs(x - y); acc += diff; }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = raft::void_op(); if (isRowMajor) { auto l1RowMajor = pairwiseDistanceMatKernel +#include +#include +#include + +#include + +namespace raft { +namespace distance { +namespace detail { + +/** + * @brief Device class for masked nearest neighbor computations. + * + * @tparam useNorms whether norms are needed + * @tparam DataT input data-type (for x and y matrices) + * @tparam AccT accumulation data-type + * @tparam IdxT index data-type + * @tparam Policy struct which tunes the Contraction kernel + * @tparam CoreLambda tells how to accumulate an x and y into + acc. Its signature is: + template <typename AccT, typename DataT> void core_lambda(AccT& acc, + const DataT& x, const DataT& y) + * @tparam EpilogueLambda applies an elementwise function to compute final + values. Its signature is: + template <typename AccT, typename DataT> void epilogue_lambda + (AccT acc[][], DataT* regxn, DataT* regyn); + * @tparam FinalLambda the final lambda called on final distance value + * @tparam rowEpilogueLambda epilog lambda that executes when a full row has + * been processed. + * + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of x + * @param[in] n number of columns of y + * @param[in] k number of cols of x and y + * @param[in] lda leading dimension of x + * @param[in] ldb leading dimension of y + * @param[in] ldd parameter to keep Contractions_NT happy. + * @param[in] xn row norms of input matrix A. Required for expanded L2, cosine + * @param[in] yn row norms of input matrix B. Required for expanded L2, cosine + * @param[in] adj An adjacency matrix encoded as a bitfield indicating for each + * row of `x` and each group in `y` whether to compute the + * distance. Dim = `(m / 64) x num_groups`. + * @param[in] group_idxs An array containing the *end* indices of each group + * in `y`. The value of group_idxs[j] indicates the + * start of group j + 1, i.e., it is the inclusive + * scan of the group lengths. The first group is + * always assumed to start at index 0 and the last + * group typically ends at index `n`. Length = + * `num_groups`. + * @param[in] num_groups The number of groups in group_idxs. + * @param[in] smem shared mem buffer for intermediate storage of x, y, xn & yn. + * @param core_op the core accumulation operation lambda + * @param epilog_op the epilog operation lambda + * @param fin_op the final gemm epilogue lambda + * @param rowEpilog_op epilog lambda that executes when a full row has been processed.
+ */ +template <bool useNorms, + typename DataT, + typename AccT, + typename IdxT, + typename Policy, + typename CoreLambda, + typename EpilogueLambda, + typename FinalLambda, + typename rowEpilogueLambda, + bool isRowMajor = true, + typename BaseClass = raft::linalg::Contractions_NT<DataT, IdxT, Policy, isRowMajor>> +struct MaskedDistances : public BaseClass { + private: + typedef Policy P; + const DataT* xn; + const DataT* yn; + const DataT* const yBase; + const uint64_t* adj; + const IdxT* group_idxs; + IdxT num_groups; + char* smem; + CoreLambda core_op; + EpilogueLambda epilog_op; + FinalLambda fin_op; + rowEpilogueLambda rowEpilog_op; + + AccT acc[P::AccRowsPerTh][P::AccColsPerTh]; + + public: + // Constructor + DI MaskedDistances(const DataT* _x, + const DataT* _y, + IdxT _m, + IdxT _n, + IdxT _k, + IdxT _lda, + IdxT _ldb, + IdxT _ldd, + const DataT* _xn, + const DataT* _yn, + const uint64_t* _adj, + const IdxT* _group_idxs, + IdxT _num_groups, + char* _smem, + CoreLambda _core_op, + EpilogueLambda _epilog_op, + FinalLambda _fin_op, + rowEpilogueLambda _rowEpilog_op) + : BaseClass(_x, _y, _m, _n, _k, _lda, _ldb, _ldd, _smem), + xn(_xn), + yn(_yn), + yBase(_y), + adj(_adj), + group_idxs(_group_idxs), + num_groups(_num_groups), + smem(_smem), + core_op(_core_op), + epilog_op(_epilog_op), + fin_op(_fin_op), + rowEpilog_op(_rowEpilog_op) + { + } + + DI void run() + { + const auto grid_stride_m = (P::Mblk * gridDim.y); + const auto grid_offset_m = (P::Mblk * blockIdx.y); + + const auto grid_stride_g = gridDim.x; + const auto grid_offset_g = blockIdx.x; + + for (auto tile_idx_m = grid_offset_m; tile_idx_m < this->m; tile_idx_m += grid_stride_m) { + // Start loop over groups + for (auto idx_g = grid_offset_g; idx_g < this->num_groups; idx_g += grid_stride_g) { + const uint64_t block_adj = get_block_adjacency(adj, tile_idx_m, idx_g); + // block_adj is a bitfield that contains a 1 if a row is adjacent to the + // current group. All zero means we can skip this group. + if (block_adj == 0) { continue; } + + // thread_adj is a bitfield that contains a 1 at location i iff we must + // compute row i of acc (the accumulator register tile). That is, + // for i = 0,.., AccRowsPerTh and j = 0,.., AccColsPerTh: + // + // ((1 << i) & thread_adj) > 0 <=> acc[i][j] must be computed. + // + // We precompute this information because it is used in various + // locations to skip thread-local computations, specifically: + // + // 1. To skip computations if thread_adj == 0, i.e., none of the values + // of `acc` have to be computed. + // + // 2. In epilog_op, to consider only values of `acc` to be reduced that + // are not masked off. + // + // Note 1: Even when the computation can be skipped for a specific thread, + // the thread still participates in synchronization operations. + // + // Note 2: In theory, it should be possible to skip computations for + // specific rows of `acc`. In practice, however, this does not improve + // performance. + int thread_adj = compute_thread_adjacency(block_adj); + + auto tile_idx_n = idx_g == 0 ? 0 : group_idxs[idx_g - 1]; + const auto group_end_n = group_idxs[idx_g]; + for (; tile_idx_n < group_end_n; tile_idx_n += P::Nblk) { + // We provide group_end_n to limit the number of unnecessary data + // points that are loaded from y. + this->ldgXY(tile_idx_m, tile_idx_n, 0, group_end_n); + + reset_accumulator(); + this->stsXY(); + __syncthreads(); + this->switch_write_buffer(); + + for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { + this->ldgXY(tile_idx_m, tile_idx_n, kidx, group_end_n); + // Process all data in shared memory (previous k-block) and + // accumulate in registers.
+ if (thread_adj != 0) { accumulate(); } + this->stsXY(); + __syncthreads(); + this->switch_write_buffer(); + this->switch_read_buffer(); + } + if (thread_adj != 0) { + accumulate(); // last iteration + } + // The pre-condition for the loop over tile_idx_n is that write_buffer + // and read_buffer point to the same buffer. This flips read_buffer + // back so that it satisfies the pre-condition of this loop. + this->switch_read_buffer(); + + if (useNorms) { + DataT regxn[P::AccRowsPerTh], regyn[P::AccColsPerTh]; + load_norms(tile_idx_m, tile_idx_n, group_end_n, regxn, regyn); + if (thread_adj != 0) { + epilog_op(acc, thread_adj, regxn, regyn, tile_idx_n, tile_idx_m, group_end_n); + } + } else { + if (thread_adj != 0) { + epilog_op(acc, thread_adj, nullptr, nullptr, tile_idx_n, tile_idx_m, group_end_n); + } + } + } // tile_idx_n + } // idx_g + rowEpilog_op(tile_idx_m); + } // tile_idx_m + } + + private: + DI uint64_t get_block_adjacency(const uint64_t* adj, IdxT tile_idx_m, IdxT idx_group) + { + // A single element of `adj` contains exactly enough bits to indicate which + // rows in the current tile to skip and which to compute. + static_assert(P::Mblk == 8 * sizeof(adj[0]), + "maskedL2NN only supports a policy with 64 rows per block."); + IdxT block_flag_idx = tile_idx_m / P::Mblk; + // Index into adj at row tile_idx_m / 64 and column idx_group. + return adj[block_flag_idx * this->num_groups + idx_group]; + } + + DI uint32_t compute_thread_adjacency(const uint64_t block_adj) + { + // thread_adj is a bitfield that contains a 1 at location i iff we must + // compute row i of acc (the accumulator register tile). It is described in + // more detail in the run() method. + uint32_t thread_adj = 0; +#pragma unroll + for (int thread_row_idx = 0; thread_row_idx < P::AccRowsPerTh; ++thread_row_idx) { + // Index `thread_row_idx` refers to a row of the current threads' register + // tile `acc`, i.e., acc[i][:]. Index `block_row_idx` refers to the + // corresponding row of the current block tile in shared memory. + const int block_row_idx = this->accrowid + thread_row_idx * P::AccThRows; + + // block_row_is_adjacent is true if the current block_row_idx is adjacent + // to the current group. + const uint64_t block_mask = 1ull << block_row_idx; + const bool block_row_is_adjacent = (block_adj & block_mask) != 0; + if (block_row_is_adjacent) { + // If block row is adjacent, write a 1 bit to thread_adj at location + // `thread_row_idx`. + const uint32_t thread_mask = 1 << thread_row_idx; + thread_adj |= thread_mask; + } + } + return thread_adj; + } + + DI void reset_accumulator() + { + // Reset accumulator registers to zero. 
+#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { + acc[i][j] = BaseClass::Zero; + } + } + } + + DI void accumulate() + { +#pragma unroll + for (int ki = 0; ki < P::Kblk; ki += P::Veclen) { + this->ldsXY(ki); +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { +#pragma unroll + for (int v = 0; v < P::Veclen; ++v) { + core_op(acc[i][j], this->regx[i][v], this->regy[j][v]); + } + } + } + } + } + + DI void load_norms(IdxT tile_idx_m, + IdxT tile_idx_n, + IdxT end_n, + DataT (®xn)[P::AccRowsPerTh], + DataT (®yn)[P::AccColsPerTh]) + { + DataT* sxNorm = (DataT*)(&smem[P::SmemSize]); + DataT* syNorm = (&sxNorm[P::Mblk]); + + // Load x & y norms required by this threadblock in shmem buffer + for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { + auto idx = tile_idx_m + i; + sxNorm[i] = idx < this->m ? xn[idx] : 0; + } + + for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { + auto idx = tile_idx_n + i; + syNorm[i] = idx < end_n ? yn[idx] : 0; + } + __syncthreads(); + +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { + regxn[i] = sxNorm[i * P::AccThRows + (threadIdx.x / P::AccThCols)]; + } +#pragma unroll + for (int i = 0; i < P::AccColsPerTh; ++i) { + regyn[i] = syNorm[i * P::AccThCols + (threadIdx.x % P::AccThCols)]; + } + } +}; // struct MaskedDistances + +}; // namespace detail +}; // namespace distance +}; // namespace raft diff --git a/cpp/include/raft/distance/detail/masked_nn.cuh b/cpp/include/raft/distance/detail/masked_nn.cuh new file mode 100644 index 0000000000..1c92de16fc --- /dev/null +++ b/cpp/include/raft/distance/detail/masked_nn.cuh @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
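A host-side restatement of the `compute_thread_adjacency` bit manipulation from the struct above, with the policy constants passed as plain arguments (illustrative only, not part of the diff):

```cpp
#include <cstdint>

// Maps a 64-row block mask to a per-thread mask: bit i of the result is set
// iff the accumulator row i owned by this thread is adjacent to the group.
inline uint32_t thread_adjacency(uint64_t block_adj,
                                 int acc_row_id,       // this->accrowid
                                 int acc_th_rows,      // P::AccThRows
                                 int acc_rows_per_th)  // P::AccRowsPerTh
{
  uint32_t thread_adj = 0;
  for (int i = 0; i < acc_rows_per_th; ++i) {
    const int block_row_idx = acc_row_id + i * acc_th_rows;
    if (block_adj & (uint64_t{1} << block_row_idx)) { thread_adj |= (1u << i); }
  }
  return thread_adj;
}
```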
+ */ + +#pragma once + +#include <limits> +#include <stdint.h> + +#include <raft/core/device_resources.hpp> +#include <raft/distance/detail/compress_to_bits.cuh> +#include <raft/distance/detail/fused_l2_nn.cuh> +#include <raft/distance/detail/masked_distance_base.cuh> +#include <raft/linalg/contractions.cuh> +#include <raft/util/cuda_utils.cuh> + +namespace raft { +namespace distance { +namespace detail { + +template <typename DataT, + typename OutT, + typename IdxT, + typename P, + typename ReduceOpT, + typename KVPReduceOpT, + typename CoreLambda, + typename FinalLambda> +__global__ __launch_bounds__(P::Nthreads, 2) void maskedL2NNkernel(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + const uint64_t* adj, + const IdxT* group_idxs, + IdxT num_groups, + IdxT m, + IdxT n, + IdxT k, + bool sqrt, + DataT maxVal, + int* mutex, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + CoreLambda core_op, + FinalLambda fin_op) +{ + extern __shared__ char smem[]; + + typedef raft::KeyValuePair<IdxT, DataT> KVPair; + KVPair val[P::AccRowsPerTh]; +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { + val[i] = {-1, maxVal}; + } + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [pairRedOp, &val, maxVal, sqrt] __device__( + DataT acc[P::AccRowsPerTh][P::AccColsPerTh], + int thread_adj, + DataT* regxn, + DataT* regyn, + IdxT tile_idx_n, + IdxT tile_idx_m, + IdxT tile_end_n) { + KVPReduceOpT pairRed_op(pairRedOp); + +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { + acc[i][j] = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j]; + } + } + if (sqrt) { +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { + acc[i][j] = raft::sqrt(acc[i][j]); + } + } + } + + // intra thread reduce + const auto acccolid = threadIdx.x % P::AccThCols; + const auto accrowid = threadIdx.x / P::AccThCols; + +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { + // thread_adj is a bitfield that contains a 1 at location i iff we must + // compute row i of acc (the accumulator register tile). It is described in + // more detail in the maskedDistances.run() method. + const bool ignore = (thread_adj & (1 << i)) == 0; + if (ignore) { continue; } +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { + auto tmpkey = acccolid + j * P::AccThCols + tile_idx_n; + if (tile_end_n <= tmpkey) { + // Do not process beyond end of tile. + continue; + } + KVPair tmp = {tmpkey, acc[i][j]}; + if (tmpkey < tile_end_n) { + val[i] = pairRed_op(accrowid + i * P::AccThRows + tile_idx_m, tmp, val[i]); + } + } + } + }; + + auto rowEpilog_lambda = + [m, mutex, min, pairRedOp, redOp, &val, maxVal] __device__(IdxT tile_idx_m) { + KVPReduceOpT pairRed_op(pairRedOp); + ReduceOpT red_op(redOp); + + const auto accrowid = threadIdx.x / P::AccThCols; + const auto lid = raft::laneId(); + // reduce +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = P::AccThCols / 2; j > 0; j >>= 1) { + auto tmpkey = raft::shfl(val[i].key, lid + j); + auto tmpvalue = raft::shfl(val[i].value, lid + j); + KVPair tmp = {tmpkey, tmpvalue}; + val[i] = pairRed_op(accrowid + i * P::AccThRows + tile_idx_m, tmp, val[i]); + } + } + + updateReducedVal(mutex, min, val, red_op, m, tile_idx_m); + + // reset the val array.
+#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { + val[i] = {-1, maxVal}; + } + }; + + IdxT lda = k, ldb = k, ldd = n; + MaskedDistances<true, + DataT, + DataT, + IdxT, + P, + CoreLambda, + decltype(epilog_lambda), + FinalLambda, + decltype(rowEpilog_lambda)> + obj(x, + y, + m, + n, + k, + lda, + ldb, + ldd, + xn, + yn, + adj, + group_idxs, + num_groups, + smem, + core_op, + epilog_lambda, + fin_op, + rowEpilog_lambda); + obj.run(); +} + +/** + * @brief Wrapper for maskedL2NNkernel + * + * Responsibilities: + * - Allocate (and initialize) workspace memory for: + * - mutexes used in nearest neighbor update step + * - adjacency matrix bitfield + * - Compress adjacency matrix to bitfield + * - Initialize output buffer (conditional on `initOutBuffer`) + * - Specify core and final operations for the L2 norm + * - Determine optimal launch configuration for kernel. + * - Launch kernel and check for errors. + * + * @tparam DataT Input data-type (for x and y matrices). + * @tparam OutT Output data-type (for key-value pairs). + * @tparam IdxT Index data-type. + * @tparam ReduceOpT A struct to perform the final needed reduction + * operation and also to initialize the output array + * elements with the appropriate initial value needed for + * reduction. + * @tparam KVPReduceOpT Type of Reduction operation on key value pairs. + * + * @param handle RAFT handle for managing expensive resources + * @param[out] out Will contain reduced output (nn key-value pairs) + * @param[in] x First matrix. Row major. Dim = `m x k`. (on device) + * @param[in] y Second matrix. Row major. Dim = `n x k`. (on device) + * @param[in] xn L2 squared norm of `x`. Length = `m`. + * @param[in] yn L2 squared norm of `y`. Length = `n`. + * @param[in] adj A boolean adjacency matrix indicating for each + * row of `x` and each group in `y` whether to compute the + * distance. Dim = `m x num_groups`. + * @param[in] group_idxs An array containing the *end* indices of each group + * in `y`. The value of group_idxs[j] indicates the + * start of group j + 1, i.e., it is the inclusive + * scan of the group lengths. The first group is + * always assumed to start at index 0 and the last + * group typically ends at index `n`. Length = + * `num_groups`. + * @param[in] num_groups Length of `group_idxs`. + * @param m Rows of `x`. + * @param n Rows of `y`. + * @param k Cols of `x` and `y`. + * @param redOp Reduction operator in the epilogue + * @param pairRedOp Reduction operation on key value pairs + * @param sqrt Whether to compute the squared or actual (i.e. sqrt) L2 norm.
+ * @param initOutBuffer Whether to initialize the output buffer + * + * + */ +template <typename DataT, + typename OutT, + typename IdxT, + typename ReduceOpT, + typename KVPReduceOpT> +void maskedL2NNImpl(raft::device_resources const& handle, + OutT* out, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + const bool* adj, + const IdxT* group_idxs, + IdxT num_groups, + IdxT m, + IdxT n, + IdxT k, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + bool sqrt, + bool initOutBuffer) +{ + typedef typename linalg::Policy4x4<DataT, 1>::Policy P; + + static_assert(P::Mblk == 64, "maskedL2NNImpl only supports a policy with 64 rows per block."); + + // Get stream and workspace memory resource + rmm::mr::device_memory_resource* ws_mr = + dynamic_cast<rmm::mr::device_memory_resource*>(handle.get_workspace_resource()); + auto stream = handle.get_stream(); + + // Acquire temporary buffers and initialize to zero: + // 1) Adjacency matrix bitfield + // 2) Workspace for fused nearest neighbor operation + size_t m_div_64 = raft::ceildiv(m, IdxT(64)); + rmm::device_uvector<uint64_t> ws_adj64{m_div_64 * num_groups, stream, ws_mr}; + rmm::device_uvector<int32_t> ws_fused_nn{size_t(m), stream, ws_mr}; + RAFT_CUDA_TRY(cudaMemsetAsync(ws_adj64.data(), 0, ws_adj64.size() * sizeof(uint64_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(ws_fused_nn.data(), 0, ws_fused_nn.size() * sizeof(int), stream)); + + // Compress boolean adjacency matrix to bitfield. + auto adj_view = raft::make_device_matrix_view<const bool, int>(adj, m, num_groups); + auto adj64_view = + raft::make_device_matrix_view<uint64_t, int>(ws_adj64.data(), m_div_64, num_groups); + compress_to_bits(handle, adj_view, adj64_view); + + // Initialize output buffer with keyvalue pairs as determined by the reduction + // operator (it will be called with maxVal). + constexpr auto maxVal = std::numeric_limits<DataT>::max(); + if (initOutBuffer) { + dim3 grid(raft::ceildiv(m, P::Nthreads)); + dim3 block(P::Nthreads); + + initKernel<DataT, OutT, IdxT, ReduceOpT><<<grid, block, 0, stream>>>(out, m, maxVal, redOp); + RAFT_CUDA_TRY(cudaGetLastError()); + } + + // Accumulation operation lambda + auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { acc += x * y; }; + auto fin_op = raft::identity_op{}; + + auto kernel = maskedL2NNkernel<DataT, OutT, IdxT, P, ReduceOpT, KVPReduceOpT, decltype(core_lambda), decltype(fin_op)>; + constexpr size_t smemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); + dim3 block(P::Nthreads); + dim3 grid = launchConfigGenerator<P>(m, n, smemSize, kernel); + + kernel<<<grid, block, smemSize, stream>>>(out, + x, + y, + xn, + yn, + ws_adj64.data(), + group_idxs, + num_groups, + m, + n, + k, + sqrt, + maxVal, + ws_fused_nn.data(), + redOp, + pairRedOp, + core_lambda, + fin_op); + + RAFT_CUDA_TRY(cudaGetLastError()); +} + +} // namespace detail +} // namespace distance +} // namespace raft diff --git a/cpp/include/raft/distance/detail/minkowski.cuh b/cpp/include/raft/distance/detail/minkowski.cuh index bda83babf1..42af8cd281 100644 --- a/cpp/include/raft/distance/detail/minkowski.cuh +++ b/cpp/include/raft/distance/detail/minkowski.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,8 +74,8 @@ void minkowskiUnExpImpl(const DataT* x, // Accumulation operation lambda auto core_lambda = [p] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::myAbs(x - y); - acc += raft::myPow(diff, p); + const auto diff = raft::abs(x - y); + acc += raft::pow(diff, p); }; // epilogue operation lambda for final value calculation @@ -89,7 +89,7 @@ void minkowskiUnExpImpl(const DataT* x, for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::myPow(acc[i][j], one_over_p); + acc[i][j] = raft::pow(acc[i][j], one_over_p); } } }; diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index 69bb83d29a..d849b23999 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,6 +59,7 @@ namespace detail { * @param core_op the core accumulation operation lambda * @param epilog_op the epilog operation lambda * @param fin_op the final gemm epilogue lambda + * @param rowEpilog_op epilog lambda that executes when a full row has been processed */ template m; - gridStrideY += P::Mblk * gridDim.y) { - for (auto gridStrideX = blockIdx.x * P::Nblk; gridStrideX < this->n; - gridStrideX += P::Nblk * gridDim.x) { - prolog(gridStrideX, gridStrideY); - loop(); - epilog(gridStrideX, gridStrideY); + for (auto tile_idx_m = grid_offset_m; tile_idx_m < this->m; tile_idx_m += grid_stride_m) { + this->ldgXY(tile_idx_m, grid_offset_n, 0); + for (auto tile_idx_n = grid_offset_n; tile_idx_n < this->n; tile_idx_n += grid_stride_n) { + // Prolog: + reset_accumulator(); + this->stsXY(); + __syncthreads(); + this->switch_write_buffer(); + + // Main loop: + for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { + this->ldgXY(tile_idx_m, tile_idx_n, kidx); + // Process all data in shared memory (previous k-block) and + // accumulate in registers. + accumulate(); + this->stsXY(); + __syncthreads(); + this->switch_write_buffer(); + this->switch_read_buffer(); + } + accumulate(); // last iteration + // The pre-condition for the loop over tile_idx_n is that write_buffer + // and read_buffer point to the same buffer. This flips read_buffer back + // so that it satisfies the pre-condition of this loop.
+ this->switch_read_buffer(); + + // Epilog: + if (useNorms) { + DataT regxn[P::AccRowsPerTh], regyn[P::AccColsPerTh]; + load_norms(tile_idx_m, tile_idx_n, regxn, regyn); + // Overlap ldg with epilog computation + ldgNextGridStride(tile_idx_m, tile_idx_n); + epilog_op(acc, regxn, regyn, tile_idx_n, tile_idx_m); + } else { + // Overlap ldg with epilog computation + ldgNextGridStride(tile_idx_m, tile_idx_n); + epilog_op(acc, nullptr, nullptr, tile_idx_n, tile_idx_m); + } + if (writeOut) { store_output(tile_idx_m, tile_idx_n); } } - rowEpilog_op(gridStrideY); + rowEpilog_op(tile_idx_m); } } private: - DI void updateIndicesY() - { - const auto stride = P::Nblk * gridDim.x; - if (isRowMajor) { - this->y += stride * this->ldb; - } else { - this->y += stride; - } - this->yrowid += stride; - } - - DI void updateIndicesXY() - { - const auto stride = P::Mblk * gridDim.y; - if (isRowMajor) { - this->x += stride * this->lda; - this->yrowid = IdxT(blockIdx.x) * P::Nblk + this->srowid; - this->y = yBase + this->yrowid * this->ldb; - } else { - this->x += stride; - this->yrowid = IdxT(blockIdx.x) * P::Nblk; - this->y = yBase + this->yrowid + this->srowid * this->ldb; - } - this->xrowid += stride; - } - - DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) + DI void ldgNextGridStride(IdxT tile_idx_m, IdxT tile_idx_n) { // Fetch next grid stride ldg if within range - if ((gridStrideX + gridDim.x * P::Nblk) < this->n) { - updateIndicesY(); - this->ldgXY(0); - } else if ((gridStrideY + gridDim.y * P::Mblk) < this->m) { - updateIndicesXY(); - this->ldgXY(0); + const auto next_tile_tile_idx_n = tile_idx_n + grid_stride_n; + const auto next_tile_tile_idx_m = tile_idx_m + grid_stride_m; + if ((next_tile_tile_idx_n) < this->n) { + this->ldgXY(tile_idx_m, next_tile_tile_idx_n, 0); + } else if ((next_tile_tile_idx_m) < this->m) { + this->ldgXY(next_tile_tile_idx_m, grid_offset_n, 0); } } - DI void prolog(IdxT gridStrideX, IdxT gridStrideY) + DI void reset_accumulator() { - if (gridStrideX == blockIdx.x * P::Nblk) { this->ldgXY(0); } - + // Reset accumulator registers to zero. #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll @@ -184,28 +199,6 @@ struct PairwiseDistances : public BaseClass { acc[i][j] = BaseClass::Zero; } } - - this->stsXY(); - __syncthreads(); - this->pageWr ^= 1; - } - - DI void loop() - { - for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { - this->ldgXY(kidx); - accumulate(); // on the previous k-block - this->stsXY(); - __syncthreads(); - this->pageWr ^= 1; - this->pageRd ^= 1; - } - accumulate(); // last iteration - // This is needed for making sure next grid stride of - // non-norm based metrics uses previously accumulated buffer so - // it doesn't make shmem dirty until previous iteration - // is complete. - this->pageRd ^= 1; } DI void accumulate() @@ -226,60 +219,52 @@ struct PairwiseDistances : public BaseClass { } } - DI void epilog(IdxT gridStrideX, IdxT gridStrideY) + DI void load_norms(IdxT tile_idx_m, + IdxT tile_idx_n, + DataT (®xn)[P::AccRowsPerTh], + DataT (®yn)[P::AccColsPerTh]) { - if (useNorms) { - DataT* sxNorm = (DataT*)(&smem[P::SmemSize]); - DataT* syNorm = (&sxNorm[P::Mblk]); - - // Load x & y norms required by this threadblock in shmem buffer - if (gridStrideX == blockIdx.x * P::Nblk) { - for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { - auto idx = gridStrideY + i; - sxNorm[i] = idx < this->m ? 
xn[idx] : 0; - } - } - - for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { - auto idx = gridStrideX + i; - syNorm[i] = idx < this->n ? yn[idx] : 0; + DataT* sxNorm = (DataT*)(&smem[P::SmemSize]); + DataT* syNorm = (&sxNorm[P::Mblk]); + + // Load x & y norms required by this threadblock in shmem buffer + if (tile_idx_n == blockIdx.x * P::Nblk) { + for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { + auto idx = tile_idx_m + i; + sxNorm[i] = idx < this->m ? xn[idx] : 0; } + } - __syncthreads(); + for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { + auto idx = tile_idx_n + i; + syNorm[i] = idx < this->n ? yn[idx] : 0; + } + __syncthreads(); - DataT regxn[P::AccRowsPerTh], regyn[P::AccColsPerTh]; #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - regxn[i] = sxNorm[i * P::AccThRows + (threadIdx.x / P::AccThCols)]; - } + for (int i = 0; i < P::AccRowsPerTh; ++i) { + regxn[i] = sxNorm[i * P::AccThRows + (threadIdx.x / P::AccThCols)]; + } #pragma unroll - for (int i = 0; i < P::AccColsPerTh; ++i) { - regyn[i] = syNorm[i * P::AccThCols + (threadIdx.x % P::AccThCols)]; - } - - // Overlap ldg with epilog computation - ldgNextGridStride(gridStrideX, gridStrideY); - epilog_op(acc, regxn, regyn, gridStrideX, gridStrideY); - } else { - // Overlap ldg with epilog computation - ldgNextGridStride(gridStrideX, gridStrideY); - epilog_op(acc, nullptr, nullptr, gridStrideX, gridStrideY); + for (int i = 0; i < P::AccColsPerTh; ++i) { + regyn[i] = syNorm[i * P::AccThCols + (threadIdx.x % P::AccThCols)]; } + } - if (writeOut) { - IdxT starty = gridStrideY + this->accrowid; - IdxT startx = gridStrideX + this->acccolid; + DI void store_output(IdxT tile_idx_m, IdxT tile_idx_n) + { + IdxT starty = tile_idx_m + this->accrowid; + IdxT startx = tile_idx_n + this->acccolid; #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - auto rowId = starty + i * P::AccThRows; + for (int i = 0; i < P::AccRowsPerTh; ++i) { + auto rowId = starty + i * P::AccThRows; #pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - auto colId = startx + j * P::AccThCols; - if (rowId < this->m && colId < this->n) { - // Promote to 64 bit index for final write, as output array can be > 2^31 - dOutput[std::size_t(rowId) * this->n + colId] = fin_op(acc[i][j], 0); - } + for (int j = 0; j < P::AccColsPerTh; ++j) { + auto colId = startx + j * P::AccThCols; + if (rowId < this->m && colId < this->n) { + // Promote to 64 bit index for final write, as output array can be > 2^31 + dOutput[std::size_t(rowId) * this->n + colId] = fin_op(acc[i][j], 0); } } } diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 1c069fc397..93a5ce7f1a 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
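The restructured `run()` above fixes the traversal order; sketched on the host with the policy members as plain parameters (illustrative only, not part of the diff):

```cpp
// Grid-stride traversal: each block walks row tiles (m) along the y grid
// dimension and column tiles (n) along the x grid dimension, consuming k in
// Kblk-sized chunks per tile pair before running the epilog.
void tile_traversal(
  int m, int n, int k, int Mblk, int Nblk, int Kblk, int grid_y, int grid_x, int bid_y, int bid_x)
{
  for (int tile_m = bid_y * Mblk; tile_m < m; tile_m += Mblk * grid_y) {
    for (int tile_n = bid_x * Nblk; tile_n < n; tile_n += Nblk * grid_x) {
      // prolog: reset accumulators, stage the first k-chunk in shared memory
      for (int kidx = 0; kidx < k; kidx += Kblk) {
        // accumulate x[tile_m.., kidx..] against y[tile_n.., kidx..]
      }
      // epilog: apply norms if needed, then store the output tile
    }
  }
}
```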
@@ -18,7 +18,7 @@ #pragma once -#include <raft/core/handle.hpp> +#include <raft/core/device_resources.hpp> #include #include #include @@ -238,7 +238,7 @@ void distance(const InType* x, * @param metric_arg metric argument (used for Minkowski distance) */ template <typename Type, typename Index_ = int> -void pairwise_distance(const raft::handle_t& handle, +void pairwise_distance(raft::device_resources const& handle, const Type* x, const Type* y, Type* dist, @@ -333,7 +333,7 @@ void pairwise_distance(const raft::handle_t& handle, * @param metric_arg metric argument (used for Minkowski distance) */ template <typename Type, typename Index_ = int> -void pairwise_distance(const raft::handle_t& handle, +void pairwise_distance(raft::device_resources const& handle, const Type* x, const Type* y, Type* dist, @@ -363,12 +363,12 @@ void pairwise_distance(const raft::handle_t& handle, * * Usage example: * @code{.cpp} - * #include <raft/core/handle.hpp> + * #include <raft/core/device_resources.hpp> * #include <raft/core/device_mdarray.hpp> * #include <raft/random/make_blobs.cuh> * #include <raft/distance/distance.cuh> * - * raft::handle_t handle; + * raft::device_resources handle; * int n_samples = 5000; * int n_features = 50; * @@ -398,7 +398,7 @@ template -void distance(raft::handle_t const& handle, +void distance(raft::device_resources const& handle, raft::device_matrix_view const x, raft::device_matrix_view const y, raft::device_matrix_view dist, @@ -441,7 +441,7 @@ void distance(raft::handle_t const& handle, * @param metric_arg metric argument (used for Minkowski distance) */ template -void pairwise_distance(raft::handle_t const& handle, +void pairwise_distance(raft::device_resources const& handle, device_matrix_view const x, device_matrix_view const y, device_matrix_view dist, diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh index 1f677e919d..e832bcb020 100644 --- a/cpp/include/raft/distance/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include #include -#include <raft/core/handle.hpp> +#include <raft/core/device_resources.hpp> #include #include #include @@ -50,7 +50,8 @@ using MinReduceOp = detail::MinReduceOpImpl<LabelT, DataT>; * Initialize array using init value from reduction op */ template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT> -void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) +void initialize( + raft::device_resources const& handle, OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) { detail::initialize(min, m, maxVal, redOp, handle.get_stream()); } diff --git a/cpp/include/raft/distance/masked_nn.cuh b/cpp/include/raft/distance/masked_nn.cuh new file mode 100644 index 0000000000..ea2e10a304 --- /dev/null +++ b/cpp/include/raft/distance/masked_nn.cuh @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
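A hedged usage sketch for the `device_resources`-based overloads shown in the distance.cuh hunk above; buffer construction mirrors the header's own example, and the function name is illustrative:

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/distance/distance.cuh>

void l1_pairwise(raft::device_resources const& handle, int n_samples, int n_features)
{
  auto x    = raft::make_device_matrix<float, int>(handle, n_samples, n_features);
  auto y    = raft::make_device_matrix<float, int>(handle, n_samples, n_features);
  auto dist = raft::make_device_matrix<float, int>(handle, n_samples, n_samples);

  // ... fill x and y ...

  raft::distance::pairwise_distance(
    handle, x.view(), y.view(), dist.view(), raft::distance::DistanceType::L1);
}
```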
+ */ + +#ifndef __MASKED_L2_NN_H +#define __MASKED_L2_NN_H + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace distance { +/** + * \defgroup masked_nn Masked 1-nearest neighbors + * @{ + */ + +/** + * @brief Parameter struct for maskedL2NN function + * + * @tparam ReduceOpT Type of reduction operator in the epilogue. + * @tparam KVPReduceOpT Type of Reduction operation on key value pairs. + * + * Usage example: + * @code{.cpp} + * #include + * + * using IdxT = int; + * using DataT = float; + * using RedOpT = raft::distance::MinAndDistanceReduceOp; + * using PairRedOpT = raft::distance::KVPMinReduce; + * using ParamT = raft::distance::MaskedL2NNParams; + * + * bool init_out = true; + * bool sqrt = false; + * + * ParamT masked_l2_params{RedOpT{}, PairRedOpT{}, sqrt, init_out}; + * @endcode + * + * Prescribes how to reduce a distance to an intermediate type (`redOp`), and + * how to reduce two intermediate types (`pairRedOp`). Typically, a distance is + * mapped to an (index, value) pair and the (index, value) pair with the lowest + * value (distance) is selected. + * + * In addition, prescribes whether to compute the square root of the distance + * (`sqrt`) and whether to initialize the output buffer (`initOutBuffer`). + */ +template +struct MaskedL2NNParams { + /** Reduction operator in the epilogue */ + ReduceOpT redOp; + /** Reduction operation on key value pairs */ + KVPReduceOpT pairRedOp; + /** Whether the output `minDist` should contain L2-sqrt */ + bool sqrt; + /** Whether to initialize the output buffer before the main kernel launch */ + bool initOutBuffer; +}; + +/** + * @brief Masked L2 distance and 1-nearest-neighbor computation in a single call. + * + * This function enables faster computation of nearest neighbors if the + * computation of distances between certain point pairs can be skipped. + * + * We use an adjacency matrix that describes which distances to calculate. The + * points in `y` are divided into groups, and the adjacency matrix indicates + * whether to compute distances between points in `x` and groups in `y`. In other + * words, if `adj[i,k]` is true, then the distance between point `x_i` and points in + * `group_k` will be calculated. + * + * **Performance considerations** + * + * The points in `x` are processed in tiles of `M` points (`M` is currently 64, + * but may change in the future). As a result, the largest compute time + * reduction occurs if all `M` points can skip a group. If only part of the `M` + * points can skip a group, then at most a minor compute time reduction and a + * modest energy use reduction can be expected. + * + * The points in `y` are also grouped into tiles of `N` points (`N` is currently + * 64, but may change in the future). As a result, group sizes should be larger + * than `N` to avoid wasting computational resources. If the group sizes are + * evenly divisible by `N`, then the computation is most efficient, although for + * larger group sizes this effect is minor. + * + * + * **Comparison to SDDMM** + * + * [SDDMM](https://ieeexplore.ieee.org/document/8638042) (sampled dense-dense + * matrix multiplication) is a matrix-matrix multiplication where only part of + * the output is computed. Compared to maskedL2NN, there are a few differences: + * + * - The output of maskedL2NN is a single vector (of nearest neighbors) and not + * a sparse matrix. + * + * - The sampling in maskedL2NN is expressed through intermediate "groups" + * rather than a CSR format.
+ * + * @tparam DataT data type + * @tparam OutT output type to either store 1-NN indices and their minimum + * distances or store only the min distances. Accordingly, one + * has to pass an appropriate `ReduceOpT` + * @tparam IdxT indexing arithmetic type + * @tparam ReduceOpT A struct to perform the final needed reduction operation + * and also to initialize the output array elements with the + * appropriate initial value needed for reduction. + * + * @param handle RAFT handle for managing expensive resources + * @param params Parameter struct specifying the reduction operations. + * @param[in] x First matrix. Row major. Dim = `m x k`. + * (on device). + * @param[in] y Second matrix. Row major. Dim = `n x k`. + * (on device). + * @param[in] x_norm L2 squared norm of `x`. Length = `m`. (on device). + * @param[in] y_norm L2 squared norm of `y`. Length = `n`. (on device) + * @param[in] adj A boolean adjacency matrix indicating for each + * row of `x` and each group in `y` whether to compute the + * distance. Dim = `m x num_groups`. + * @param[in] group_idxs An array containing the *end* indices of each group + * in `y`. The value of group_idxs[j] indicates the + * start of group j + 1, i.e., it is the inclusive + * scan of the group lengths. The first group is + * always assumed to start at index 0 and the last + * group typically ends at index `n`. Length = + * `num_groups`. + * @param[out] out will contain the reduced output (Length = `m`) + * (on device) + */ +template +void maskedL2NN(raft::device_resources const& handle, + raft::distance::MaskedL2NNParams params, + raft::device_matrix_view x, + raft::device_matrix_view y, + raft::device_vector_view x_norm, + raft::device_vector_view y_norm, + raft::device_matrix_view adj, + raft::device_vector_view group_idxs, + raft::device_vector_view out) +{ + IdxT m = x.extent(0); + IdxT n = y.extent(0); + IdxT k = x.extent(1); + IdxT num_groups = group_idxs.extent(0); + + // Match k dimension of x, y + RAFT_EXPECTS(x.extent(1) == y.extent(1), "Dimension of vectors in x and y must be equal."); + // Match x, x_norm and y, y_norm + RAFT_EXPECTS(m == x_norm.extent(0), "Length of `x_norm` must match input `x`."); + RAFT_EXPECTS(n == y_norm.extent(0), "Length of `y_norm` must match input `y`."); + // Match adj to x and group_idxs + RAFT_EXPECTS(m == adj.extent(0), "#rows in `adj` must match input `x`."); + RAFT_EXPECTS(num_groups == adj.extent(1), "#cols in `adj` must match length of `group_idxs`."); + // NOTE: We do not check if all indices in group_idxs actually point *inside* y. + + // If there is no work to be done, return immediately. + if (m == 0 || n == 0 || k == 0 || num_groups == 0) { return; } + + detail::maskedL2NNImpl(handle, + out.data_handle(), + x.data_handle(), + y.data_handle(), + x_norm.data_handle(), + y_norm.data_handle(), + adj.data_handle(), + group_idxs.data_handle(), + num_groups, + m, + n, + k, + params.redOp, + params.pairRedOp, + params.sqrt, + params.initOutBuffer); +} + +/** @} */ + +} // namespace distance +} // namespace raft + +#endif diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 4525af49d2..caa68061db 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
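Putting the new masked_nn.cuh API together, the following end-to-end sketch shows how a caller might invoke maskedL2NN. The KeyValuePair output type and the template arguments of MinAndDistanceReduceOp/KVPMinReduce are assumptions patterned on the fused_l2_nn operators (the example in the header elides them), so treat this as a sketch rather than the exact signatures.
@code{.cpp}
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/kvp.hpp>
#include <raft/distance/masked_nn.cuh>

using IdxT  = int;
using DataT = float;
using OutT       = raft::KeyValuePair<IdxT, DataT>;                      // assumed 1-NN output
using RedOpT     = raft::distance::MinAndDistanceReduceOp<IdxT, DataT>;  // assumed args
using PairRedOpT = raft::distance::KVPMinReduce<IdxT, DataT>;            // assumed args

raft::device_resources handle;
IdxT m = 1024, n = 2048, k = 64, num_groups = 16;

auto x          = raft::make_device_matrix<DataT, IdxT>(handle, m, k);
auto y          = raft::make_device_matrix<DataT, IdxT>(handle, n, k);
auto x_norm     = raft::make_device_vector<DataT, IdxT>(handle, m);
auto y_norm     = raft::make_device_vector<DataT, IdxT>(handle, n);
auto adj        = raft::make_device_matrix<bool, IdxT>(handle, m, num_groups);
auto group_idxs = raft::make_device_vector<IdxT, IdxT>(handle, num_groups);
auto out        = raft::make_device_vector<OutT, IdxT>(handle, m);

// ... fill x and y, compute their squared L2 row norms, set adj and group_idxs ...

raft::distance::MaskedL2NNParams<RedOpT, PairRedOpT> params{
  RedOpT{}, PairRedOpT{}, /*sqrt=*/false, /*initOutBuffer=*/true};

raft::distance::maskedL2NN<DataT, OutT, IdxT>(
  handle, params, x.view(), y.view(), x_norm.view(), y_norm.view(),
  adj.view(), group_idxs.view(), out.view());
@endcode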
@@ -21,4 +21,4 @@ #pragma once -#include +#include diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh index 27ab24abe8..608c63e1a9 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -95,7 +95,7 @@ void addDevScalar( * @brief Elementwise add operation * @tparam InType Input Type raft::device_mdspan * @tparam OutType Output Type raft::device_mdspan - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in1 First Input * @param[in] in2 Second Input * @param[out] out Output @@ -104,7 +104,7 @@ template , typename = raft::enable_if_output_device_mdspan> -void add(const raft::handle_t& handle, InType in1, InType in2, OutType out) +void add(raft::device_resources const& handle, InType in1, InType in2, OutType out) { using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; @@ -135,7 +135,7 @@ void add(const raft::handle_t& handle, InType in1, InType in2, OutType out) * @tparam InType Input Type raft::device_mdspan * @tparam OutType Output Type raft::device_mdspan * @tparam ScalarIdxType Index Type of scalar - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in Input * @param[in] scalar raft::device_scalar_view * @param[in] out Output @@ -145,7 +145,7 @@ template , typename = raft::enable_if_output_device_mdspan> -void add_scalar(const raft::handle_t& handle, +void add_scalar(raft::device_resources const& handle, InType in, OutType out, raft::device_scalar_view scalar) @@ -177,7 +177,7 @@ void add_scalar(const raft::handle_t& handle, * @tparam InType Input Type raft::device_mdspan * @tparam OutType Output Type raft::device_mdspan * @tparam ScalarIdxType Index Type of scalar - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in Input * @param[in] scalar raft::host_scalar_view * @param[in] out Output @@ -187,7 +187,7 @@ template , typename = raft::enable_if_output_device_mdspan> -void add_scalar(const raft::handle_t& handle, +void add_scalar(raft::device_resources const& handle, const InType in, OutType out, raft::host_scalar_view scalar) diff --git a/cpp/include/raft/linalg/axpy.cuh b/cpp/include/raft/linalg/axpy.cuh index 35a34bc2b5..9b3af73234 100644 --- a/cpp/include/raft/linalg/axpy.cuh +++ b/cpp/include/raft/linalg/axpy.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
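For reference, the mdspan `add` overload above can be driven as in the following hedged sketch; the mdarray factory helpers and include paths are assumed, and the element-wise semantics follow the doc comment.
@code{.cpp}
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/linalg/add.cuh>

raft::device_resources handle;
int n = 1000;

auto in1 = raft::make_device_vector<float, int>(handle, n);
auto in2 = raft::make_device_vector<float, int>(handle, n);
auto out = raft::make_device_vector<float, int>(handle, n);

// ... fill in1 and in2 ...

// out = in1 + in2, element-wise, on the stream owned by the handle
raft::linalg::add(handle, in1.view(), in2.view(), out.view());
@endcode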
@@ -41,7 +41,7 @@ namespace raft::linalg { * @param [in] stream */ template -void axpy(const raft::handle_t& handle, +void axpy(raft::device_resources const& handle, const int n, const T* alpha, const T* x, @@ -62,7 +62,7 @@ void axpy(const raft::handle_t& handle, * @brief axpy function * It computes the following equation: y = alpha * x + y * - * @param [in] handle raft::handle_t + * @param [in] handle raft::device_resources * @param [in] alpha raft::device_scalar_view * @param [in] x Input vector * @param [inout] y Output vector @@ -72,7 +72,7 @@ template -void axpy(const raft::handle_t& handle, +void axpy(raft::device_resources const& handle, raft::device_scalar_view alpha, raft::device_vector_view x, raft::device_vector_view y) @@ -92,7 +92,7 @@ void axpy(const raft::handle_t& handle, /** * @brief axpy function * It computes the following equation: y = alpha * x + y - * @param [in] handle raft::handle_t + * @param [in] handle raft::device_resources * @param [in] alpha raft::device_scalar_view * @param [in] x Input vector * @param [inout] y Output vector @@ -102,7 +102,7 @@ template -void axpy(const raft::handle_t& handle, +void axpy(raft::device_resources const& handle, raft::host_scalar_view alpha, raft::device_vector_view x, raft::device_vector_view y) diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index 693ef961c2..966e84965d 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include "detail/binary_op.cuh" #include -#include +#include #include #include @@ -65,7 +65,7 @@ void binaryOp( * @tparam InType Input Type raft::device_mdspan * @tparam Lambda the device-lambda performing the actual operation * @tparam OutType Output Type raft::device_mdspan - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in1 First input * @param[in] in2 Second input * @param[out] out Output @@ -78,7 +78,7 @@ template , typename = raft::enable_if_output_device_mdspan> -void binary_op(const raft::handle_t& handle, InType in1, InType in2, OutType out, Lambda op) +void binary_op(raft::device_resources const& handle, InType in1, InType in2, OutType out, Lambda op) { RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous"); diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh index af8d12d873..e10f43653b 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -121,7 +121,7 @@ namespace linalg { * conditioned systems. Negative values mean no regularization.
*/ template -void choleskyRank1Update(const raft::handle_t& handle, +void choleskyRank1Update(raft::device_resources const& handle, math_t* L, int n, int ld, diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh index 45cd640edc..674be207d8 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include "detail/coalesced_reduction.cuh" #include -#include +#include #include namespace raft { @@ -101,7 +101,7 @@ void coalescedReduction(OutType* dots, * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) * It must be a 'callable' supporting the following input and output: *
OutType (*FinalLambda)(OutType);
- * @param handle raft::handle_t + * @param handle raft::device_resources * @param[in] data Input of type raft::device_matrix_view * @param[out] dots Output of type raft::device_matrix_view * @param[in] init initial value to use for the reduction @@ -117,7 +117,7 @@ template -void coalesced_reduction(const raft::handle_t& handle, +void coalesced_reduction(raft::device_resources const& handle, raft::device_matrix_view data, raft::device_vector_view dots, OutValueType init, diff --git a/cpp/include/raft/linalg/detail/axpy.cuh b/cpp/include/raft/linalg/detail/axpy.cuh index f3e1a177c8..5747e840c4 100644 --- a/cpp/include/raft/linalg/detail/axpy.cuh +++ b/cpp/include/raft/linalg/detail/axpy.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,12 +20,12 @@ #include "cublas_wrappers.hpp" -#include +#include namespace raft::linalg::detail { template -void axpy(const raft::handle_t& handle, +void axpy(raft::device_resources const& handle, const int n, const T* alpha, const T* x, diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh index 47937815bd..afa9155753 100644 --- a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ #include "cublas_wrappers.hpp" #include "cusolver_wrappers.hpp" -#include +#include #include namespace raft { @@ -26,7 +26,7 @@ namespace linalg { namespace detail { template -void choleskyRank1Update(const raft::handle_t& handle, +void choleskyRank1Update(raft::device_resources const& handle, math_t* L, int n, int ld, diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh index 5d83f88e71..b15cb222b4 100644 --- a/cpp/include/raft/linalg/detail/contractions.cuh +++ b/cpp/include/raft/linalg/detail/contractions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -40,14 +40,10 @@ struct Contractions_NT { /** leading dimension in Output D */ IdxT ldd; - /** current thread's global mem row id for X data */ - IdxT xrowid; - /** current thread's global mem row id for Y data */ - IdxT yrowid; /** global memory pointer to X matrix */ - const DataT* x; + const DataT* x_base; /** global memory pointer to Y matrix */ - const DataT* y; + const DataT* y_base; /** current thread's smem row id */ int srowid; @@ -94,10 +90,8 @@ struct Contractions_NT { k(_k), lda(_k), ldb(_k), - xrowid(IdxT(blockIdx.x) * P::Mblk + threadIdx.x / P::LdgThRow), - yrowid(IdxT(blockIdx.y) * P::Nblk + threadIdx.x / P::LdgThRow), - x(_x + xrowid * lda), - y(_y + yrowid * ldb), + x_base(_x), + y_base(_y), srowid(threadIdx.x / P::LdgThRow), scolid((threadIdx.x % P::LdgThRow) * P::Veclen), accrowid(threadIdx.x / P::AccThCols), @@ -133,6 +127,8 @@ struct Contractions_NT { lda(_lda), ldb(_ldb), ldd(_ldd), + x_base(_x), + y_base(_y), srowid(threadIdx.x / P::LdgThRow), scolid((threadIdx.x % P::LdgThRow) * P::Veclen), accrowid(threadIdx.x / P::AccThCols), @@ -142,17 +138,6 @@ struct Contractions_NT { pageWr(0), pageRd(0) { - if (isRowMajor) { - xrowid = IdxT(blockIdx.y) * P::Mblk + srowid; - yrowid = IdxT(blockIdx.x) * P::Nblk + srowid; - x = _x + xrowid * lda; - y = _y + yrowid * ldb; - } else { - xrowid = IdxT(blockIdx.y) * P::Mblk; - yrowid = IdxT(blockIdx.x) * P::Nblk; - x = _x + xrowid + srowid * lda; - y = _y + yrowid + srowid * ldb; - } } protected: @@ -160,10 +145,16 @@ struct Contractions_NT { * @brief Load current block of X/Y from global memory to registers * @param[in] kidx current start index of k to be loaded */ - DI void ldgXY(IdxT kidx) + DI void ldgXY(IdxT tile_idx_m, IdxT tile_idx_n, IdxT kidx) + { + ldgX(tile_idx_m, kidx); + ldgY(tile_idx_n, kidx); + } + + DI void ldgXY(IdxT tile_idx_m, IdxT tile_idx_n, IdxT kidx, IdxT tile_end_n) { - ldgX(kidx); - ldgY(kidx); + ldgX(tile_idx_m, kidx); + ldgY(tile_idx_n, kidx, tile_end_n); } /** @@ -186,9 +177,16 @@ struct Contractions_NT { ldsY(kidx, sy + pageRd * P::SmemPage); } + DI void switch_read_buffer() { this->pageRd ^= 1; } + + DI void switch_write_buffer() { this->pageWr ^= 1; } + private: - DI void ldgX(IdxT kidx) + DI void ldgX(IdxT tile_idx_m, IdxT kidx) { + IdxT xrowid = isRowMajor ? tile_idx_m + srowid : tile_idx_m; + auto x = isRowMajor ? x_base + xrowid * lda : x_base + xrowid + srowid * lda; + if (isRowMajor) { auto numRows = m; auto koffset = kidx + scolid; @@ -220,10 +218,15 @@ struct Contractions_NT { } } - DI void ldgY(IdxT kidx) + DI void ldgY(IdxT tile_idx_n, IdxT kidx) { ldgY(tile_idx_n, kidx, n); } + + DI void ldgY(IdxT tile_idx_n, IdxT kidx, IdxT end_n) { + IdxT yrowid = isRowMajor ? tile_idx_n + srowid : tile_idx_n; + auto y = isRowMajor ? 
y_base + yrowid * ldb : y_base + yrowid + srowid * ldb; + if (isRowMajor) { - auto numRows = n; + auto numRows = end_n; auto koffset = kidx + scolid; #pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { @@ -241,7 +244,7 @@ struct Contractions_NT { auto koffset = scolid; #pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { - if ((koffset + yrowid) < ldb && (srowid + kidx + i * P::LdgRowsY) < numRows) { + if ((koffset + yrowid) < end_n && (srowid + kidx + i * P::LdgRowsY) < numRows) { ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset); } else { #pragma unroll @@ -315,4 +318,4 @@ struct Contractions_NT { } // namespace detail } // namespace linalg -} // namespace raft \ No newline at end of file +} // namespace raft diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh index d48b42fc57..94493efb24 100644 --- a/cpp/include/raft/linalg/detail/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ #include "cusolver_wrappers.hpp" #include -#include +#include #include #include #include @@ -29,7 +29,7 @@ namespace linalg { namespace detail { template -void eigDC_legacy(const raft::handle_t& handle, +void eigDC_legacy(raft::device_resources const& handle, const math_t* in, std::size_t n_rows, std::size_t n_cols, @@ -74,7 +74,7 @@ void eigDC_legacy(const raft::handle_t& handle, } template -void eigDC(const raft::handle_t& handle, +void eigDC(raft::device_resources const& handle, const math_t* in, std::size_t n_rows, std::size_t n_cols, @@ -137,7 +137,7 @@ void eigDC(const raft::handle_t& handle, enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; template -void eigSelDC(const raft::handle_t& handle, +void eigSelDC(raft::device_resources const& handle, math_t* in, std::size_t n_rows, std::size_t n_cols, @@ -228,7 +228,7 @@ void eigSelDC(const raft::handle_t& handle, } template -void eigJacobi(const raft::handle_t& handle, +void eigJacobi(raft::device_resources const& handle, const math_t* in, std::size_t n_rows, std::size_t n_cols, diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index baa066984b..ba9496c3b9 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
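The Contractions_NT hunks above also make the shared-memory double buffering explicit: switch_read_buffer() and switch_write_buffer() flip pageRd and pageWr so the next tile can be prefetched into one smem page while the math consumes the other. The following self-contained kernel is a hypothetical illustration of that idiom, not RAFT code; it assumes a single block of 256 threads.
@code{.cpp}
// Each thread sums in[t], in[256 + t], ... using two shared-memory pages:
// while one page is read, the next tile is loaded into the other.
__global__ void double_buffered_sum(const float* in, float* out, int n)
{
  __shared__ float page[2][256];
  int pageWr = 0;
  int pageRd = 0;
  float acc  = 0.f;

  // Prime the pipeline: stage the first tile in page 0.
  int tid           = threadIdx.x;
  page[pageWr][tid] = tid < n ? in[tid] : 0.f;
  pageWr ^= 1;  // switch_write_buffer()
  __syncthreads();

  for (int base = 256; base < n; base += 256) {
    // Prefetch the next tile into the write page...
    page[pageWr][tid] = (base + tid) < n ? in[base + tid] : 0.f;
    // ...while consuming the tile already staged in the read page.
    acc += page[pageRd][tid];
    __syncthreads();
    pageWr ^= 1;  // switch_write_buffer()
    pageRd ^= 1;  // switch_read_buffer()
  }
  acc += page[pageRd][tid];  // drain the last staged tile

  out[tid] = acc;  // launch as double_buffered_sum<<<1, 256>>>(in, out, n)
}
@endcode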
@@ -20,7 +20,7 @@ #include "cublas_wrappers.hpp" -#include +#include namespace raft { namespace linalg { @@ -49,7 +49,7 @@ namespace detail { * @param [in] stream */ template -void gemm(const raft::handle_t& handle, +void gemm(raft::device_resources const& handle, const bool trans_a, const bool trans_b, const int m, @@ -103,7 +103,7 @@ void gemm(const raft::handle_t& handle, * @param stream cuda stream */ template -void gemm(const raft::handle_t& handle, +void gemm(raft::device_resources const& handle, const math_t* a, int n_rows_a, int n_cols_a, @@ -130,7 +130,7 @@ void gemm(const raft::handle_t& handle, } template -void gemm(const raft::handle_t& handle, +void gemm(raft::device_resources const& handle, const math_t* a, int n_rows_a, int n_cols_a, @@ -149,7 +149,7 @@ void gemm(const raft::handle_t& handle, } template -void gemm(const raft::handle_t& handle, +void gemm(raft::device_resources const& handle, T* z, T* x, T* y, diff --git a/cpp/include/raft/linalg/detail/gemv.hpp b/cpp/include/raft/linalg/detail/gemv.hpp index 38fcdcd82e..b3e001a851 100644 --- a/cpp/include/raft/linalg/detail/gemv.hpp +++ b/cpp/include/raft/linalg/detail/gemv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,14 +20,14 @@ #include "cublas_wrappers.hpp" -#include +#include namespace raft { namespace linalg { namespace detail { template -void gemv(const raft::handle_t& handle, +void gemv(raft::device_resources const& handle, const bool trans_a, const int m, const int n, @@ -59,7 +59,7 @@ void gemv(const raft::handle_t& handle, } template -void gemv(const raft::handle_t& handle, +void gemv(raft::device_resources const& handle, const math_t* A, const int n_rows, const int n_cols, @@ -76,7 +76,7 @@ void gemv(const raft::handle_t& handle, } template -void gemv(const raft::handle_t& handle, +void gemv(raft::device_resources const& handle, const math_t* A, const int n_rows_a, const int n_cols_a, @@ -91,7 +91,7 @@ void gemv(const raft::handle_t& handle, } template -void gemv(const raft::handle_t& handle, +void gemv(raft::device_resources const& handle, const math_t* A, const int n_rows_a, const int n_cols_a, @@ -107,7 +107,7 @@ void gemv(const raft::handle_t& handle, } template -void gemv(const raft::handle_t& handle, +void gemv(raft::device_resources const& handle, const math_t* A, const int n_rows_a, const int n_cols_a, @@ -126,7 +126,7 @@ void gemv(const raft::handle_t& handle, } template -void gemv(const raft::handle_t& handle, +void gemv(raft::device_resources const& handle, const math_t* A, const int n_rows_a, const int n_cols_a, diff --git a/cpp/include/raft/linalg/detail/lanczos.cuh b/cpp/include/raft/linalg/detail/lanczos.cuh index 5a3c595512..8c0cfeba28 100644 --- a/cpp/include/raft/linalg/detail/lanczos.cuh +++ b/cpp/include/raft/linalg/detail/lanczos.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ #include #include "cublas_wrappers.hpp" -#include +#include #include #include #include @@ -82,7 +82,7 @@ inline curandStatus_t curandGenerateNormalX( * @return Zero if successful. Otherwise non-zero. 
*/ template -int performLanczosIteration(handle_t const& handle, +int performLanczosIteration(raft::device_resources const& handle, spectral::matrix::sparse_matrix_t const* A, index_type_t* iter, index_type_t maxIter, @@ -540,7 +540,7 @@ static int francisQRIteration(index_type_t n, * @return error flag. */ template -static int lanczosRestart(handle_t const& handle, +static int lanczosRestart(raft::device_resources const& handle, index_type_t n, index_type_t iter, index_type_t iter_new, @@ -743,7 +743,7 @@ static int lanczosRestart(handle_t const& handle, */ template int computeSmallestEigenvectors( - handle_t const& handle, + raft::device_resources const& handle, spectral::matrix::sparse_matrix_t const* A, index_type_t nEigVecs, index_type_t maxIter, @@ -984,7 +984,7 @@ int computeSmallestEigenvectors( template int computeSmallestEigenvectors( - handle_t const& handle, + raft::device_resources const& handle, spectral::matrix::sparse_matrix_t const& A, index_type_t nEigVecs, index_type_t maxIter, @@ -1087,7 +1087,7 @@ int computeSmallestEigenvectors( */ template int computeLargestEigenvectors( - handle_t const& handle, + raft::device_resources const& handle, spectral::matrix::sparse_matrix_t const* A, index_type_t nEigVecs, index_type_t maxIter, @@ -1331,7 +1331,7 @@ int computeLargestEigenvectors( template int computeLargestEigenvectors( - handle_t const& handle, + raft::device_resources const& handle, spectral::matrix::sparse_matrix_t const& A, index_type_t nEigVecs, index_type_t maxIter, diff --git a/cpp/include/raft/linalg/detail/lstsq.cuh b/cpp/include/raft/linalg/detail/lstsq.cuh index 1273956b21..207bcefc32 100644 --- a/cpp/include/raft/linalg/detail/lstsq.cuh +++ b/cpp/include/raft/linalg/detail/lstsq.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -104,7 +104,7 @@ struct DivideByNonZero { operator()(const math_t a, const math_t b) const { - return raft::myAbs(b) >= eps ? a / b : a; + return raft::abs(b) >= eps ? a / b : a; } }; @@ -117,7 +117,7 @@ struct DivideByNonZero { * so it's not guaranteed to stay unmodified. */ template -void lstsqSvdQR(const raft::handle_t& handle, +void lstsqSvdQR(raft::device_resources const& handle, math_t* A, const int n_rows, const int n_cols, @@ -177,7 +177,7 @@ void lstsqSvdQR(const raft::handle_t& handle, * so it's not guaranteed to stay unmodified. */ template -void lstsqSvdJacobi(const raft::handle_t& handle, +void lstsqSvdJacobi(raft::device_resources const& handle, math_t* A, const int n_rows, const int n_cols, @@ -248,7 +248,7 @@ void lstsqSvdJacobi(const raft::handle_t& handle, * (`w = (A^T A)^-1 A^T b`) */ template -void lstsqEig(const raft::handle_t& handle, +void lstsqEig(raft::device_resources const& handle, const math_t* A, const int n_rows, const int n_cols, @@ -352,7 +352,7 @@ void lstsqEig(const raft::handle_t& handle, * Warning: the content of this vector is modified by the cuSOLVER routines. */ template -void lstsqQR(const raft::handle_t& handle, +void lstsqQR(raft::device_resources const& handle, math_t* A, const int n_rows, const int n_cols, diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index add003eb52..e0b473bdd4 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include #include diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh index 7ef9ca1c43..70bb2df4f5 100644 --- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include #include diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh index 74e9c3e1aa..4cba028d87 100644 --- a/cpp/include/raft/linalg/detail/qr.cuh +++ b/cpp/include/raft/linalg/detail/qr.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,7 +42,7 @@ namespace detail { */ template void qrGetQ_inplace( - const raft::handle_t& handle, math_t* Q, int n_rows, int n_cols, cudaStream_t stream) + raft::device_resources const& handle, math_t* Q, int n_rows, int n_cols, cudaStream_t stream) { RAFT_EXPECTS(n_rows >= n_cols, "QR decomposition expects n_rows >= n_cols."); cusolverDnHandle_t cusolver = handle.get_cusolver_dn_handle(); @@ -83,7 +83,7 @@ void qrGetQ_inplace( } template -void qrGetQ(const raft::handle_t& handle, +void qrGetQ(raft::device_resources const& handle, const math_t* M, math_t* Q, int n_rows, @@ -95,7 +95,7 @@ void qrGetQ(const raft::handle_t& handle, } template -void qrGetQR(const raft::handle_t& handle, +void qrGetQR(raft::device_resources const& handle, math_t* M, math_t* Q, math_t* R, diff --git a/cpp/include/raft/linalg/detail/rsvd.cuh b/cpp/include/raft/linalg/detail/rsvd.cuh index f96598d9e6..a66a23179b 100644 --- a/cpp/include/raft/linalg/detail/rsvd.cuh +++ b/cpp/include/raft/linalg/detail/rsvd.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ namespace detail { * @param stream cuda stream */ template -void rsvdFixedRank(const raft::handle_t& handle, +void rsvdFixedRank(raft::device_resources const& handle, math_t* M, int n_rows, int n_cols, @@ -371,7 +371,7 @@ void rsvdFixedRank(const raft::handle_t& handle, * @param stream cuda stream */ template -void rsvdPerc(const raft::handle_t& handle, +void rsvdPerc(raft::device_resources const& handle, math_t* M, int n_rows, int n_cols, diff --git a/cpp/include/raft/linalg/detail/svd.cuh b/cpp/include/raft/linalg/detail/svd.cuh index 8626c7888b..4850744f51 100644 --- a/cpp/include/raft/linalg/detail/svd.cuh +++ b/cpp/include/raft/linalg/detail/svd.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include @@ -36,7 +36,7 @@ namespace linalg { namespace detail { template -void svdQR(const raft::handle_t& handle, +void svdQR(raft::device_resources const& handle, T* in, int n_rows, int n_cols, @@ -102,7 +102,7 @@ void svdQR(const raft::handle_t& handle, } template -void svdEig(const raft::handle_t& handle, +void svdEig(raft::device_resources const& handle, math_t* in, idx_t n_rows, idx_t n_cols, @@ -162,7 +162,7 @@ void svdEig(const raft::handle_t& handle, } template -void svdJacobi(const raft::handle_t& handle, +void svdJacobi(raft::device_resources const& handle, math_t* in, int n_rows, int n_cols, @@ -232,7 +232,7 @@ void svdJacobi(const raft::handle_t& handle, } template -void svdReconstruction(const raft::handle_t& handle, +void svdReconstruction(raft::device_resources const& handle, math_t* U, math_t* S, math_t* V, @@ -263,7 +263,7 @@ void svdReconstruction(const raft::handle_t& handle, } template -bool evaluateSVDByL2Norm(const raft::handle_t& handle, +bool evaluateSVDByL2Norm(raft::device_resources const& handle, math_t* A_d, math_t* U, math_t* S_vec, diff --git a/cpp/include/raft/linalg/detail/transpose.cuh b/cpp/include/raft/linalg/detail/transpose.cuh index ef5551ea7e..9e7b236fed 100644 --- a/cpp/include/raft/linalg/detail/transpose.cuh +++ b/cpp/include/raft/linalg/detail/transpose.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ #include "cublas_wrappers.hpp" #include -#include +#include #include #include #include @@ -29,7 +29,7 @@ namespace linalg { namespace detail { template -void transpose(const raft::handle_t& handle, +void transpose(raft::device_resources const& handle, math_t* in, math_t* out, int n_rows, @@ -82,7 +82,7 @@ void transpose(math_t* inout, int n, cudaStream_t stream) template void transpose_row_major_impl( - handle_t const& handle, + raft::device_resources const& handle, raft::mdspan, LayoutPolicy, AccessorPolicy> in, raft::mdspan, LayoutPolicy, AccessorPolicy> out) { @@ -108,7 +108,7 @@ void transpose_row_major_impl( template void transpose_col_major_impl( - handle_t const& handle, + raft::device_resources const& handle, raft::mdspan, LayoutPolicy, AccessorPolicy> in, raft::mdspan, LayoutPolicy, AccessorPolicy> out) { diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh index 526d8a9716..0b18e6175c 100644 --- a/cpp/include/raft/linalg/divide.cuh +++ b/cpp/include/raft/linalg/divide.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -56,7 +56,7 @@ void divideScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_ * @tparam InType Input Type raft::device_mdspan * @tparam OutType Output Type raft::device_mdspan * @tparam ScalarIdxType Index Type of scalar - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in Input * @param[in] scalar raft::host_scalar_view * @param[out] out Output @@ -66,7 +66,7 @@ template , typename = raft::enable_if_output_device_mdspan> -void divide_scalar(const raft::handle_t& handle, +void divide_scalar(raft::device_resources const& handle, InType in, OutType out, raft::host_scalar_view scalar) diff --git a/cpp/include/raft/linalg/dot.cuh b/cpp/include/raft/linalg/dot.cuh index 4b1bc913e1..917188d695 100644 --- a/cpp/include/raft/linalg/dot.cuh +++ b/cpp/include/raft/linalg/dot.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include #include -#include +#include #include namespace raft::linalg { @@ -33,7 +33,7 @@ namespace raft::linalg { /** * @brief Computes the dot product of two vectors. - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] x First input vector * @param[in] y Second input vector * @param[out] out The output dot product between the x and y vectors. @@ -43,7 +43,7 @@ template -void dot(const raft::handle_t& handle, +void dot(raft::device_resources const& handle, raft::device_vector_view x, raft::device_vector_view y, raft::device_scalar_view out) @@ -63,7 +63,7 @@ void dot(const raft::handle_t& handle, /** * @brief Computes the dot product of two vectors. - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] x First input vector * @param[in] y Second input vector * @param[out] out The output dot product between the x and y vectors. @@ -73,7 +73,7 @@ template -void dot(const raft::handle_t& handle, +void dot(raft::device_resources const& handle, raft::device_vector_view x, raft::device_vector_view y, raft::host_scalar_view out) diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index 271ff13db5..03e94a10b1 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,7 +38,7 @@ namespace linalg { * @param stream cuda stream */ template -void eigDC(const raft::handle_t& handle, +void eigDC(raft::device_resources const& handle, const math_t* in, std::size_t n_rows, std::size_t n_cols, @@ -68,7 +68,7 @@ using detail::OVERWRITE_INPUT; * @param stream cuda stream */ template -void eigSelDC(const raft::handle_t& handle, +void eigSelDC(raft::device_resources const& handle, math_t* in, std::size_t n_rows, std::size_t n_cols, @@ -97,7 +97,7 @@ void eigSelDC(const raft::handle_t& handle, * accuracy. 
*/ template -void eigJacobi(const raft::handle_t& handle, +void eigJacobi(raft::device_resources const& handle, const math_t* in, std::size_t n_rows, std::size_t n_cols, @@ -120,14 +120,14 @@ void eigJacobi(const raft::handle_t& handle, * symmetric matrices * @tparam ValueType the data-type of input and output * @tparam IntegerType Integer used for addressing - * @param handle raft::handle_t + * @param handle raft::device_resources * @param[in] in input raft::device_matrix_view (symmetric matrix that has real eig values and * vectors) * @param[out] eig_vectors: eigenvectors output of type raft::device_matrix_view * @param[out] eig_vals: eigen values output of type raft::device_vector_view */ template -void eig_dc(const raft::handle_t& handle, +void eig_dc(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_matrix_view eig_vectors, raft::device_vector_view eig_vals) @@ -149,7 +149,7 @@ void eig_dc(const raft::handle_t& handle, * for the column-major symmetric matrices * @tparam ValueType the data-type of input and output * @tparam IntegerType Integer used for addressing - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in input raft::device_matrix_view (symmetric matrix that has real eig values and * vectors) * @param[out] eig_vectors: eigenvectors output of type raft::device_matrix_view @@ -158,7 +158,7 @@ void eig_dc(const raft::handle_t& handle, * @param[in] memUsage: the memory selection for eig vector output */ template -void eig_dc_selective(const raft::handle_t& handle, +void eig_dc_selective(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_matrix_view eig_vectors, raft::device_vector_view eig_vals, @@ -185,7 +185,7 @@ void eig_dc_selective(const raft::handle_t& handle, * column-major symmetric matrices (in parameter) * @tparam ValueType the data-type of input and output * @tparam IntegerType Integer used for addressing - * @param handle raft::handle_t + * @param handle raft::device_resources * @param[in] in input raft::device_matrix_view (symmetric matrix that has real eig values and * vectors) * @param[out] eig_vectors: eigenvectors output of type raft::device_matrix_view @@ -196,7 +196,7 @@ void eig_dc_selective(const raft::handle_t& handle, * accuracy. */ template -void eig_jacobi(const raft::handle_t& handle, +void eig_jacobi(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_matrix_view eig_vectors, raft::device_vector_view eig_vals, diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index f2354da6c6..d5dc5ffab5 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
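As a usage illustration for the mdspan eig_dc overload above, a hedged sketch follows; it assumes the mdarray factories and raft::make_const_mdspan are available, and uses the column-major symmetric input that the doc comment requires.
@code{.cpp}
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/linalg/eig.cuh>

raft::device_resources handle;
int n = 64;

auto A        = raft::make_device_matrix<float, int, raft::col_major>(handle, n, n);
auto eig_vecs = raft::make_device_matrix<float, int, raft::col_major>(handle, n, n);
auto eig_vals = raft::make_device_vector<float, int>(handle, n);

// ... fill A with a symmetric matrix ...

raft::linalg::eig_dc(
  handle, raft::make_const_mdspan(A.view()), eig_vecs.view(), eig_vals.view());
@endcode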
@@ -52,7 +52,7 @@ namespace linalg { * @param [in] stream */ template -void gemm(const raft::handle_t& handle, +void gemm(raft::device_resources const& handle, const bool trans_a, const bool trans_b, const int m, @@ -91,7 +91,7 @@ void gemm(const raft::handle_t& handle, * @param stream cuda stream */ template -void gemm(const raft::handle_t& handle, +void gemm(raft::device_resources const& handle, const math_t* a, int n_rows_a, int n_cols_a, @@ -126,7 +126,7 @@ void gemm(const raft::handle_t& handle, * @param stream cuda stream */ template -void gemm(const raft::handle_t& handle, +void gemm(raft::device_resources const& handle, const math_t* a, int n_rows_a, int n_cols_a, @@ -161,7 +161,7 @@ void gemm(const raft::handle_t& handle, * @param beta scalar */ template -void gemm(const raft::handle_t& handle, +void gemm(raft::device_resources const& handle, T* z, T* x, T* y, @@ -213,7 +213,7 @@ template >, std::is_same>>>> -void gemm(const raft::handle_t& handle, +void gemm(raft::device_resources const& handle, raft::device_matrix_view x, raft::device_matrix_view y, raft::device_matrix_view z, diff --git a/cpp/include/raft/linalg/gemv.cuh b/cpp/include/raft/linalg/gemv.cuh index 8132a742f8..96846003f6 100644 --- a/cpp/include/raft/linalg/gemv.cuh +++ b/cpp/include/raft/linalg/gemv.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -50,7 +50,7 @@ namespace linalg { * @param [in] stream */ template -void gemv(const raft::handle_t& handle, +void gemv(raft::device_resources const& handle, const bool trans_a, const int m, const int n, @@ -69,7 +69,7 @@ void gemv(const raft::handle_t& handle, } template -void gemv(const raft::handle_t& handle, +void gemv(raft::device_resources const& handle, const math_t* A, const int n_rows, const int n_cols, @@ -103,7 +103,7 @@ void gemv(const raft::handle_t& handle, * @param stream stream on which this function is run */ template -void gemv(const raft::handle_t& handle, +void gemv(raft::device_resources const& handle, const math_t* A, const int n_rows_a, const int n_cols_a, @@ -133,7 +133,7 @@ void gemv(const raft::handle_t& handle, * @param stream stream on which this function is run */ template -void gemv(const raft::handle_t& handle, +void gemv(raft::device_resources const& handle, const math_t* A, const int n_rows_a, const int n_cols_a, @@ -165,7 +165,7 @@ void gemv(const raft::handle_t& handle, * @param stream stream on which this function is run */ template -void gemv(const raft::handle_t& handle, +void gemv(raft::device_resources const& handle, const math_t* A, const int n_rows_a, const int n_cols_a, @@ -199,7 +199,7 @@ void gemv(const raft::handle_t& handle, * */ template -void gemv(const raft::handle_t& handle, +void gemv(raft::device_resources const& handle, const math_t* A, const int n_rows_a, const int n_cols_a, @@ -246,7 +246,7 @@ template >, std::is_same>>>> -void gemv(const raft::handle_t& handle, +void gemv(raft::device_resources const& handle, raft::device_matrix_view A, raft::device_vector_view x, raft::device_vector_view y, diff --git a/cpp/include/raft/linalg/lstsq.cuh b/cpp/include/raft/linalg/lstsq.cuh index 7654812886..b36a9eba96 100644 --- a/cpp/include/raft/linalg/lstsq.cuh +++ b/cpp/include/raft/linalg/lstsq.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ #pragma once -#include +#include #include namespace raft { namespace linalg { @@ -37,7 +37,7 @@ namespace linalg { * @param[in] stream cuda stream for ordering operations */ template -void lstsqSvdQR(const raft::handle_t& handle, +void lstsqSvdQR(raft::device_resources const& handle, math_t* A, const int n_rows, const int n_cols, @@ -62,7 +62,7 @@ void lstsqSvdQR(const raft::handle_t& handle, * @param[in] stream cuda stream for ordering operations */ template -void lstsqSvdJacobi(const raft::handle_t& handle, +void lstsqSvdJacobi(raft::device_resources const& handle, math_t* A, const int n_rows, const int n_cols, @@ -78,7 +78,7 @@ void lstsqSvdJacobi(const raft::handle_t& handle, * (`w = (A^T A)^-1 A^T b`) */ template -void lstsqEig(const raft::handle_t& handle, +void lstsqEig(raft::device_resources const& handle, const math_t* A, const int n_rows, const int n_cols, @@ -104,7 +104,7 @@ void lstsqEig(const raft::handle_t& handle, * @param[in] stream cuda stream for ordering operations */ template -void lstsqQR(const raft::handle_t& handle, +void lstsqQR(raft::device_resources const& handle, math_t* A, const int n_rows, const int n_cols, @@ -125,7 +125,7 @@ void lstsqQR(const raft::handle_t& handle, * Via SVD decomposition of `A = U S Vt`. * * @tparam ValueType the data-type of input/output - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[inout] A input raft::device_matrix_view * Warning: the content of this matrix is modified. * @param[inout] b input target raft::device_vector_view @@ -133,7 +133,7 @@ void lstsqQR(const raft::handle_t& handle, * @param[out] w output coefficient raft::device_vector_view */ template -void lstsq_svd_qr(const raft::handle_t& handle, +void lstsq_svd_qr(raft::device_resources const& handle, raft::device_matrix_view A, raft::device_vector_view b, raft::device_vector_view w) @@ -155,7 +155,7 @@ void lstsq_svd_qr(const raft::handle_t& handle, * Via SVD decomposition of `A = U S V^T` using Jacobi iterations. * * @tparam ValueType the data-type of input/output - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[inout] A input raft::device_matrix_view * Warning: the content of this matrix is modified. * @param[inout] b input target raft::device_vector_view @@ -163,7 +163,7 @@ void lstsq_svd_qr(const raft::handle_t& handle, * @param[out] w output coefficient raft::device_vector_view */ template -void lstsq_svd_jacobi(const raft::handle_t& handle, +void lstsq_svd_jacobi(raft::device_resources const& handle, raft::device_matrix_view A, raft::device_vector_view b, raft::device_vector_view w) @@ -186,7 +186,7 @@ void lstsq_svd_jacobi(const raft::handle_t& handle, * (`w = (A^T A)^-1 A^T b`) * * @tparam ValueType the data-type of input/output - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[inout] A input raft::device_matrix_view * Warning: the content of this matrix is modified by the cuSOLVER routines. 
* @param[inout] b input target raft::device_vector_view @@ -194,7 +194,7 @@ void lstsq_svd_jacobi(const raft::handle_t& handle, * @param[out] w output coefficient raft::device_vector_view */ template -void lstsq_eig(const raft::handle_t& handle, +void lstsq_eig(raft::device_resources const& handle, raft::device_matrix_view A, raft::device_vector_view b, raft::device_vector_view w) @@ -217,7 +217,7 @@ void lstsq_eig(const raft::handle_t& handle, * (triangular system of equations `Rw = Q^T b`) * * @tparam ValueType the data-type of input/output - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[inout] A input raft::device_matrix_view * Warning: the content of this matrix is modified. * @param[inout] b input target raft::device_vector_view @@ -225,7 +225,7 @@ void lstsq_eig(const raft::handle_t& handle, * @param[out] w output coefficient raft::device_vector_view */ template -void lstsq_qr(const raft::handle_t& handle, +void lstsq_qr(raft::device_resources const& handle, raft::device_matrix_view A, raft::device_vector_view b, raft::device_vector_view w) diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh index ad35cc5880..2b9e6c80a0 100644 --- a/cpp/include/raft/linalg/map.cuh +++ b/cpp/include/raft/linalg/map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,9 @@ #include "detail/map.cuh" #include +#include #include +#include namespace raft { namespace linalg { @@ -65,7 +67,7 @@ void map_k( * @tparam TPB threads-per-block in the final kernel launched * @tparam OutType data-type of result of type raft::device_mdspan * @tparam Args additional parameters - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in the input of type raft::device_mdspan * @param[out] out the output of the map operation of type raft::device_mdspan * @param[in] map the device-lambda @@ -78,7 +80,7 @@ template , typename = raft::enable_if_output_device_mdspan> -void map(const raft::handle_t& handle, InType in, OutType out, MapOp map, Args... args) +void map(raft::device_resources const& handle, InType in, OutType out, MapOp map, Args... args) { using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; @@ -96,9 +98,43 @@ void map(const raft::handle_t& handle, InType in, OutType out, MapOp map, Args.. } } +/** + * @brief Perform an element-wise unary operation on the input offset into the output array + * + * Usage example: + * @code{.cpp} + * #include + * #include + * #include + * #include + * ... 
+ * raft::device_resources handle; + * auto squares = raft::make_device_vector(handle, n); + * raft::linalg::map_offset(handle, squares.view(), raft::sq_op()); + * @endcode + * + * @tparam OutType Output mdspan type + * @tparam MapOp The unary operation type with signature `OutT func(const IdxT& idx);` + * @param[in] handle The raft handle + * @param[out] out Output array + * @param[in] op The unary operation + */ +template > +void map_offset(const raft::device_resources& handle, OutType out, MapOp op) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + + using out_value_t = typename OutType::value_type; + + thrust::tabulate( + handle.get_thrust_policy(), out.data_handle(), out.data_handle() + out.size(), op); +} + /** @} */ // end of map } // namespace linalg }; // namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/linalg/map_reduce.cuh b/cpp/include/raft/linalg/map_reduce.cuh index 4158d35bca..b89f3bdd54 100644 --- a/cpp/include/raft/linalg/map_reduce.cuh +++ b/cpp/include/raft/linalg/map_reduce.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -75,7 +75,7 @@ void mapReduce(OutType* out, * @tparam OutValueType the data-type of the output * @tparam ScalarIdxType index type of scalar * @tparam Args additional parameters - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in the input of type raft::device_vector_view * @param[in] neutral The neutral element of the reduction operation. For example: * 0 for sum, 1 for multiply, +Inf for Min, -Inf for Max @@ -91,7 +91,7 @@ template -void map_reduce(const raft::handle_t& handle, +void map_reduce(raft::device_resources const& handle, raft::device_vector_view in, raft::device_scalar_view out, OutValueType neutral, diff --git a/cpp/include/raft/linalg/matrix_vector.cuh b/cpp/include/raft/linalg/matrix_vector.cuh index 5529ded16f..fa24ea28b7 100644 --- a/cpp/include/raft/linalg/matrix_vector.cuh +++ b/cpp/include/raft/linalg/matrix_vector.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
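Since map_offset above is a thin wrapper over thrust::tabulate, the two calls in this sketch are equivalent; it mirrors the squares example from the new doc comment, with the include paths assumed.
@code{.cpp}
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/operators.hpp>
#include <raft/linalg/map.cuh>
#include <thrust/tabulate.h>

raft::device_resources handle;
int n = 10;
auto squares = raft::make_device_vector<int, int>(handle, n);

// squares[i] = i * i, computed purely from each element's offset
raft::linalg::map_offset(handle, squares.view(), raft::sq_op());

// ...which expands to this direct thrust call:
thrust::tabulate(handle.get_thrust_policy(),
                 squares.view().data_handle(),
                 squares.view().data_handle() + n,
                 raft::sq_op());
@endcode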
@@ -37,7 +37,7 @@ namespace raft::linalg { * the rows of the matrix or columns using enum class raft::linalg::Apply */ template -void binary_mult_skip_zero(const raft::handle_t& handle, +void binary_mult_skip_zero(raft::device_resources const& handle, raft::device_matrix_view data, raft::device_vector_view vec, Apply apply) @@ -70,7 +70,7 @@ void binary_mult_skip_zero(const raft::handle_t& handle, * the rows of the matrix or columns using enum class raft::linalg::Apply */ template -void binary_div(const raft::handle_t& handle, +void binary_div(raft::device_resources const& handle, raft::device_matrix_view data, raft::device_vector_view vec, Apply apply) @@ -105,7 +105,7 @@ void binary_div(const raft::handle_t& handle, * value if false */ template -void binary_div_skip_zero(const raft::handle_t& handle, +void binary_div_skip_zero(raft::device_resources const& handle, raft::device_matrix_view data, raft::device_vector_view vec, Apply apply, @@ -140,7 +140,7 @@ void binary_div_skip_zero(const raft::handle_t& handle, * the rows of the matrix or columns using enum class raft::linalg::Apply */ template -void binary_add(const raft::handle_t& handle, +void binary_add(raft::device_resources const& handle, raft::device_matrix_view data, raft::device_vector_view vec, Apply apply) @@ -173,7 +173,7 @@ void binary_add(const raft::handle_t& handle, * the rows of the matrix or columns using enum class raft::linalg::Apply */ template -void binary_sub(const raft::handle_t& handle, +void binary_sub(raft::device_resources const& handle, raft::device_matrix_view data, raft::device_vector_view vec, Apply apply) diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 8b5163a714..59b2ca5ee5 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -122,7 +122,7 @@ void matrixVectorOp(MatT* out, * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major) * @tparam Lambda a device function which represents a binary operator * @tparam IndexType Integer used for addressing - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] matrix input raft::matrix_view * @param[in] vec vector raft::vector_view * @param[out] out output raft::matrix_view @@ -135,7 +135,7 @@ template -void matrix_vector_op(const raft::handle_t& handle, +void matrix_vector_op(raft::device_resources const& handle, raft::device_matrix_view matrix, raft::device_vector_view vec, raft::device_matrix_view out, @@ -182,7 +182,7 @@ void matrix_vector_op(const raft::handle_t& handle, * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major) * @tparam Lambda a device function which represents a binary operator * @tparam IndexType Integer used for addressing - * @param handle raft::handle_t + * @param handle raft::device_resources * @param matrix input raft::matrix_view * @param vec1 the first vector raft::vector_view * @param vec2 the second vector raft::vector_view @@ -197,7 +197,7 @@ template -void matrix_vector_op(const raft::handle_t& handle, +void matrix_vector_op(raft::device_resources const& handle, raft::device_matrix_view matrix, raft::device_vector_view vec1, raft::device_vector_view vec2, diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh index a3360ae35a..62f4896d01 100644 --- a/cpp/include/raft/linalg/mean_squared_error.cuh +++ b/cpp/include/raft/linalg/mean_squared_error.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,14 +53,14 @@ void meanSquaredError( * @tparam IndexType Input/Output index type * @tparam OutValueType Output data-type * @tparam TPB threads-per-block - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] A input raft::device_vector_view * @param[in] B input raft::device_vector_view * @param[out] out the output mean squared error value of type raft::device_scalar_view * @param[in] weight weight to apply to every term in the mean squared error calculation */ template -void mean_squared_error(const raft::handle_t& handle, +void mean_squared_error(raft::device_resources const& handle, raft::device_vector_view A, raft::device_vector_view B, raft::device_scalar_view out, diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh index 119cf667d1..574b88c63d 100644 --- a/cpp/include/raft/linalg/multiply.cuh +++ b/cpp/include/raft/linalg/multiply.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -56,7 +56,7 @@ void multiplyScalar(out_t* out, const in_t* in, in_t scalar, IdxType len, cudaSt * @tparam InType Input Type raft::device_mdspan * @tparam OutType Output Type raft::device_mdspan * @tparam ScalarIdxType Index Type of scalar - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in the input buffer * @param[out] out the output buffer * @param[in] scalar the scalar used in the operations @@ -68,7 +68,7 @@ template , typename = raft::enable_if_output_device_mdspan> void multiply_scalar( - const raft::handle_t& handle, + raft::device_resources const& handle, InType in, OutType out, raft::host_scalar_view scalar) diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh index b64b128fa2..8bc6720b4e 100644 --- a/cpp/include/raft/linalg/norm.cuh +++ b/cpp/include/raft/linalg/norm.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -99,7 +99,7 @@ void colNorm(Type* dots, * @tparam LayoutPolicy the layout of input (raft::row_major or raft::col_major) * @tparam IdxType Integer type used to for addressing * @tparam Lambda device final lambda - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in the input raft::device_matrix_view * @param[out] out the output raft::device_vector_view * @param[in] type the type of norm to be applied @@ -111,7 +111,7 @@ template -void norm(const raft::handle_t& handle, +void norm(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_vector_view out, NormType type, diff --git a/cpp/include/raft/linalg/normalize.cuh b/cpp/include/raft/linalg/normalize.cuh index bf6ef5a570..027ebb16e8 100644 --- a/cpp/include/raft/linalg/normalize.cuh +++ b/cpp/include/raft/linalg/normalize.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
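A sketch of the updated mdspan `norm` entry point (the `NormType` and `Apply` arguments and the sizes are illustrative assumptions):

```cpp
#include <raft/core/device_resources.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/linalg/norm.cuh>

void row_l2_norm_example(raft::device_resources const& handle)
{
  auto in  = raft::make_device_matrix<float, int, raft::row_major>(handle, 100, 16);
  auto out = raft::make_device_vector<float, int>(handle, 100);
  // One L2 norm per row of `in`.
  raft::linalg::norm(handle, in.view(), out.view(),
                     raft::linalg::L2Norm, raft::linalg::Apply::ALONG_ROWS);
}
```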
@@ -37,7 +37,7 @@ namespace linalg { * @tparam MainLambda Type of main_op * @tparam ReduceLambda Type of reduce_op * @tparam FinalLambda Type of fin_op - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in the input raft::device_matrix_view * @param[out] out the output raft::device_matrix_view * @param[in] init Initialization value, i.e identity element for the reduction operation @@ -52,7 +52,7 @@ template -void row_normalize(const raft::handle_t& handle, +void row_normalize(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out, ElementType init, @@ -85,14 +85,14 @@ void row_normalize(const raft::handle_t& handle, * * @tparam ElementType Input/Output data type * @tparam IndexType Integer type used to for addressing - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in the input raft::device_matrix_view * @param[out] out the output raft::device_matrix_view * @param[in] norm_type the type of norm to be applied * @param[in] eps If the norm is below eps, the row is considered zero and no division is applied */ template -void row_normalize(const raft::handle_t& handle, +void row_normalize(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out, NormType norm_type, diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh index 59c2cdf314..1fdfcb3780 100644 --- a/cpp/include/raft/linalg/power.cuh +++ b/cpp/include/raft/linalg/power.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,7 +73,7 @@ void power(out_t* out, const in_t* in1, const in_t* in2, IdxType len, cudaStream * @brief Elementwise power operation on the input buffers * @tparam InType Input Type raft::device_mdspan * @tparam OutType Output Type raft::device_mdspan - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in1 First Input * @param[in] in2 Second Input * @param[out] out Output @@ -82,7 +82,7 @@ template , typename = raft::enable_if_output_device_mdspan> -void power(const raft::handle_t& handle, InType in1, InType in2, OutType out) +void power(raft::device_resources const& handle, InType in1, InType in2, OutType out) { using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; @@ -113,7 +113,7 @@ void power(const raft::handle_t& handle, InType in1, InType in2, OutType out) * @tparam InType Input Type raft::device_mdspan * @tparam OutType Output Type raft::device_mdspan * @tparam ScalarIdxType Index Type of scalar - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in Input * @param[out] out Output * @param[in] scalar raft::host_scalar_view @@ -124,7 +124,7 @@ template , typename = raft::enable_if_output_device_mdspan> void power_scalar( - const raft::handle_t& handle, + raft::device_resources const& handle, InType in, OutType out, const raft::host_scalar_view scalar) diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh index 7c5c0ea628..8e58af63c1 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/qr.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
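For the `row_normalize` overloads above, a minimal usage sketch (the epsilon value and shapes are assumptions, not taken from the patch):

```cpp
#include <raft/core/device_resources.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/linalg/normalize.cuh>

void l2_normalize_rows_example(raft::device_resources const& handle)
{
  auto in  = raft::make_device_matrix<float, int, raft::row_major>(handle, 256, 32);
  auto out = raft::make_device_matrix<float, int, raft::row_major>(handle, 256, 32);
  // Rows whose norm falls below eps are treated as zero: no division is applied.
  raft::linalg::row_normalize(handle, in.view(), out.view(),
                              raft::linalg::L2Norm, 1e-8f);
}
```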
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ namespace linalg { * @param stream cuda stream */ template -void qrGetQ(const raft::handle_t& handle, +void qrGetQ(raft::device_resources const& handle, const math_t* M, math_t* Q, int n_rows, @@ -54,7 +54,7 @@ void qrGetQ(const raft::handle_t& handle, * @param stream cuda stream */ template -void qrGetQR(const raft::handle_t& handle, +void qrGetQR(raft::device_resources const& handle, math_t* M, math_t* Q, math_t* R, @@ -72,12 +72,12 @@ void qrGetQR(const raft::handle_t& handle, /** * @brief Compute the QR decomposition of matrix M and return only the Q matrix. - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] M Input raft::device_matrix_view * @param[out] Q Output raft::device_matrix_view */ template -void qr_get_q(const raft::handle_t& handle, +void qr_get_q(raft::device_resources const& handle, raft::device_matrix_view M, raft::device_matrix_view Q) { @@ -88,13 +88,13 @@ void qr_get_q(const raft::handle_t& handle, /** * @brief Compute the QR decomposition of matrix M and return both the Q and R matrices. - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] M Input raft::device_matrix_view * @param[out] Q Output raft::device_matrix_view * @param[out] R Output raft::device_matrix_view */ template -void qr_get_qr(const raft::handle_t& handle, +void qr_get_qr(raft::device_resources const& handle, raft::device_matrix_view M, raft::device_matrix_view Q, raft::device_matrix_view R) diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index 3eb8196408..ae5457c44f 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -105,7 +105,7 @@ void reduce(OutType* dots, * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) * It must be a 'callable' supporting the following input and output: *
OutType (*FinalLambda)(OutType);
- * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] data Input of type raft::device_matrix_view * @param[out] dots Output of type raft::device_matrix_view * @param[in] init initial value to use for the reduction @@ -122,7 +122,7 @@ template -void reduce(const raft::handle_t& handle, +void reduce(raft::device_resources const& handle, raft::device_matrix_view data, raft::device_vector_view dots, OutElementType init, diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh index 7b0ad2f984..2b744d8134 100644 --- a/cpp/include/raft/linalg/reduce_cols_by_key.cuh +++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include "detail/reduce_cols_by_key.cuh" #include -#include +#include namespace raft { namespace linalg { @@ -69,7 +69,7 @@ void reduce_cols_by_key(const T* data, * @tparam ElementType the input data type (as well as the output reduced matrix) * @tparam KeyType data type of the keys * @tparam IndexType indexing arithmetic type - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] data the input data (dim = nrows x ncols). This is assumed to be in * row-major layout of type raft::device_matrix_view * @param[in] keys keys raft::device_vector_view (len = ncols). It is assumed that each key in this @@ -84,7 +84,7 @@ void reduce_cols_by_key(const T* data, */ template void reduce_cols_by_key( - const raft::handle_t& handle, + raft::device_resources const& handle, raft::device_matrix_view data, raft::device_vector_view keys, raft::device_matrix_view out, diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh index 1dabd92087..484b60238b 100644 --- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include "detail/reduce_rows_by_key.cuh" #include -#include +#include namespace raft { namespace linalg { @@ -136,7 +136,7 @@ void reduce_rows_by_key(const DataIteratorT d_A, * @tparam KeyType data-type of keys * @tparam WeightType data-type of weights * @tparam IndexType index type - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] d_A Input raft::device_mdspan (ncols * nrows) * @param[in] d_keys Keys for each row raft::device_vector_view (1 x nrows) * @param[out] d_sums Row sums by key raft::device_matrix_view (ncols x d_keys) @@ -148,7 +148,7 @@ void reduce_rows_by_key(const DataIteratorT d_A, */ template void reduce_rows_by_key( - const raft::handle_t& handle, + raft::device_resources const& handle, raft::device_matrix_view d_A, raft::device_vector_view d_keys, raft::device_matrix_view d_sums, diff --git a/cpp/include/raft/linalg/rsvd.cuh b/cpp/include/raft/linalg/rsvd.cuh index 6f0315642b..eb94547f13 100644 --- a/cpp/include/raft/linalg/rsvd.cuh +++ b/cpp/include/raft/linalg/rsvd.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,7 +47,7 @@ namespace linalg { * @param stream cuda stream */ template -void rsvdFixedRank(const raft::handle_t& handle, +void rsvdFixedRank(raft::device_resources const& handle, math_t* M, int n_rows, int n_cols, @@ -104,7 +104,7 @@ void rsvdFixedRank(const raft::handle_t& handle, * @param stream cuda stream */ template -void rsvdPerc(const raft::handle_t& handle, +void rsvdPerc(raft::device_resources const& handle, math_t* M, int n_rows, int n_cols, @@ -154,7 +154,7 @@ void rsvdPerc(const raft::handle_t& handle, * U_in * @tparam VType std::optional> @c * V_in - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) * @param[in] p no. of upsamples @@ -164,7 +164,7 @@ void rsvdPerc(const raft::handle_t& handle, * raft::col_major */ template -void rsvd_fixed_rank(const raft::handle_t& handle, +void rsvd_fixed_rank(raft::device_resources const& handle, raft::device_matrix_view M, raft::device_vector_view S_vec, IndexType p, @@ -228,7 +228,7 @@ void rsvd_fixed_rank(Args... args) * U_in * @tparam VType std::optional> @c * V_in - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) * @param[in] p no. of upsamples @@ -239,7 +239,7 @@ void rsvd_fixed_rank(Args... args) */ template void rsvd_fixed_rank_symmetric( - const raft::handle_t& handle, + raft::device_resources const& handle, raft::device_matrix_view M, raft::device_vector_view S_vec, IndexType p, @@ -303,7 +303,7 @@ void rsvd_fixed_rank_symmetric(Args... args) * U_in * @tparam VType std::optional> @c * V_in - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) * @param[in] p no. of upsamples @@ -315,7 +315,7 @@ void rsvd_fixed_rank_symmetric(Args... args) * raft::col_major */ template -void rsvd_fixed_rank_jacobi(const raft::handle_t& handle, +void rsvd_fixed_rank_jacobi(raft::device_resources const& handle, raft::device_matrix_view M, raft::device_vector_view S_vec, IndexType p, @@ -381,7 +381,7 @@ void rsvd_fixed_rank_jacobi(Args... args) * U_in * @tparam VType std::optional> @c * V_in - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) * @param[in] p no. of upsamples @@ -394,7 +394,7 @@ void rsvd_fixed_rank_jacobi(Args... args) */ template void rsvd_fixed_rank_symmetric_jacobi( - const raft::handle_t& handle, + raft::device_resources const& handle, raft::device_matrix_view M, raft::device_vector_view S_vec, IndexType p, @@ -460,7 +460,7 @@ void rsvd_fixed_rank_symmetric_jacobi(Args... 
args) * U_in * @tparam VType std::optional> @c * V_in - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) * @param[in] PC_perc percentage of singular values to be computed @@ -471,7 +471,7 @@ void rsvd_fixed_rank_symmetric_jacobi(Args... args) * raft::col_major */ template -void rsvd_perc(const raft::handle_t& handle, +void rsvd_perc(raft::device_resources const& handle, raft::device_matrix_view M, raft::device_vector_view S_vec, ValueType PC_perc, @@ -536,7 +536,7 @@ void rsvd_perc(Args... args) * U_in * @tparam VType std::optional> @c * V_in - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) * @param[in] PC_perc percentage of singular values to be computed @@ -547,7 +547,7 @@ void rsvd_perc(Args... args) * raft::col_major */ template -void rsvd_perc_symmetric(const raft::handle_t& handle, +void rsvd_perc_symmetric(raft::device_resources const& handle, raft::device_matrix_view M, raft::device_vector_view S_vec, ValueType PC_perc, @@ -612,7 +612,7 @@ void rsvd_perc_symmetric(Args... args) * U_in * @tparam VType std::optional> @c * V_in - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) * @param[in] PC_perc percentage of singular values to be computed @@ -625,7 +625,7 @@ void rsvd_perc_symmetric(Args... args) * raft::col_major */ template -void rsvd_perc_jacobi(const raft::handle_t& handle, +void rsvd_perc_jacobi(raft::device_resources const& handle, raft::device_matrix_view M, raft::device_vector_view S_vec, ValueType PC_perc, @@ -692,7 +692,7 @@ void rsvd_perc_jacobi(Args... args) * U_in * @tparam VType std::optional> @c * V_in - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] M input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S_vec singular values raft::device_vector_view of shape (K) * @param[in] PC_perc percentage of singular values to be computed @@ -706,7 +706,7 @@ void rsvd_perc_jacobi(Args... args) */ template void rsvd_perc_symmetric_jacobi( - const raft::handle_t& handle, + raft::device_resources const& handle, raft::device_matrix_view M, raft::device_vector_view S_vec, ValueType PC_perc, diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh index ad6cad2eb2..55e661897d 100644 --- a/cpp/include/raft/linalg/sqrt.cuh +++ b/cpp/include/raft/linalg/sqrt.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
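Stepping back to the `reduce` overload updated a few hunks above, here is a minimal sum-of-squares-per-row sketch, assuming the `raft::sq_op`/`raft::add_op` functors from `raft/core/operators.hpp` (shapes and the argument order are illustrative of the signature shown in that hunk):

```cpp
#include <raft/core/device_resources.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/operators.hpp>
#include <raft/linalg/reduce.cuh>

void sum_of_squares_example(raft::device_resources const& handle)
{
  auto data = raft::make_device_matrix<float, int, raft::row_major>(handle, 64, 128);
  auto dots = raft::make_device_vector<float, int>(handle, 64);
  // Reduce each row of `data` into one entry of `dots`: dots[i] = sum_j data(i,j)^2
  raft::linalg::reduce(handle, data.view(), dots.view(),
                       0.0f,                             // init value
                       raft::linalg::Apply::ALONG_ROWS,  // reduce along rows
                       false,                            // not in-place
                       raft::sq_op{},                    // applied to each element
                       raft::add_op{});                  // combines elements
}
```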
@@ -51,7 +51,7 @@ void sqrt(out_t* out, const in_t* in, IdxType len, cudaStream_t stream) * @brief Elementwise sqrt operation * @tparam InType Input Type raft::device_mdspan * @tparam OutType Output Type raft::device_mdspan - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in Input * @param[out] out Output */ @@ -59,7 +59,7 @@ template , typename = raft::enable_if_output_device_mdspan> -void sqrt(const raft::handle_t& handle, InType in, OutType out) +void sqrt(raft::device_resources const& handle, InType in, OutType out) { using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh index d9c26910e7..d282a2e1fa 100644 --- a/cpp/include/raft/linalg/strided_reduction.cuh +++ b/cpp/include/raft/linalg/strided_reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,7 +112,7 @@ void stridedReduction(OutType* dots, * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) * It must be a 'callable' supporting the following input and output: *
OutType (*FinalLambda)(OutType);
- * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] data Input of type raft::device_matrix_view * @param[out] dots Output of type raft::device_matrix_view * @param[in] init initial value to use for the reduction @@ -128,7 +128,7 @@ template -void strided_reduction(const raft::handle_t& handle, +void strided_reduction(raft::device_resources const& handle, raft::device_matrix_view data, raft::device_vector_view dots, OutValueType init, diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh index e6f2fa8724..da995b7a2a 100644 --- a/cpp/include/raft/linalg/subtract.cuh +++ b/cpp/include/raft/linalg/subtract.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -97,7 +97,7 @@ void subtractDevScalar(math_t* outDev, * @brief Elementwise subtraction operation on the input buffers * @tparam InType Input Type raft::device_mdspan * @tparam OutType Output Type raft::device_mdspan - * @param handle raft::handle_t + * @param handle raft::device_resources * @param[in] in1 First Input * @param[in] in2 Second Input * @param[out] out Output @@ -106,7 +106,7 @@ template , typename = raft::enable_if_output_device_mdspan> -void subtract(const raft::handle_t& handle, InType in1, InType in2, OutType out) +void subtract(raft::device_resources const& handle, InType in1, InType in2, OutType out) { using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; @@ -137,7 +137,7 @@ void subtract(const raft::handle_t& handle, InType in1, InType in2, OutType out) * @tparam InType Input Type raft::device_mdspan * @tparam OutType Output Type raft::device_mdspan * @tparam ScalarIdxType Index Type of scalar - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in Input * @param[out] out Output * @param[in] scalar raft::device_scalar_view @@ -148,7 +148,7 @@ template , typename = raft::enable_if_output_device_mdspan> void subtract_scalar( - const raft::handle_t& handle, + raft::device_resources const& handle, InType in, OutType out, raft::device_scalar_view scalar) @@ -182,7 +182,7 @@ void subtract_scalar( * @tparam InType Input Type raft::device_mdspan * @tparam OutType Output Type raft::device_mdspan * @tparam ScalarIdxType Index Type of scalar - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in Input * @param[out] out Output * @param[in] scalar raft::host_scalar_view @@ -193,7 +193,7 @@ template , typename = raft::enable_if_output_device_mdspan> void subtract_scalar( - const raft::handle_t& handle, + raft::device_resources const& handle, InType in, OutType out, raft::host_scalar_view scalar) diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh index 2c1b5a5cb7..eb51093240 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/svd.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
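A sketch of the device-scalar `subtract_scalar` overload above (names and sizes are illustrative assumptions):

```cpp
#include <raft/core/device_resources.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/linalg/subtract.cuh>

void shift_example(raft::device_resources const& handle)
{
  auto in     = raft::make_device_vector<float, int>(handle, 1024);
  auto out    = raft::make_device_vector<float, int>(handle, 1024);
  auto scalar = raft::make_device_scalar<float>(handle, 0.5f);
  // out[i] = in[i] - *scalar, with the scalar kept in device memory.
  raft::linalg::subtract_scalar(handle, in.view(), out.view(), scalar.view());
}
```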
@@ -41,7 +41,7 @@ namespace linalg { * @param stream cuda stream */ template -void svdQR(const raft::handle_t& handle, +void svdQR(raft::device_resources const& handle, T* in, int n_rows, int n_cols, @@ -67,7 +67,7 @@ void svdQR(const raft::handle_t& handle, } template -void svdEig(const raft::handle_t& handle, +void svdEig(raft::device_resources const& handle, math_t* in, idx_t n_rows, idx_t n_cols, @@ -98,7 +98,7 @@ void svdEig(const raft::handle_t& handle, * @param stream cuda stream */ template -void svdJacobi(const raft::handle_t& handle, +void svdJacobi(raft::device_resources const& handle, math_t* in, int n_rows, int n_cols, @@ -139,7 +139,7 @@ void svdJacobi(const raft::handle_t& handle, * @param stream cuda stream */ template -void svdReconstruction(const raft::handle_t& handle, +void svdReconstruction(raft::device_resources const& handle, math_t* U, math_t* S, math_t* V, @@ -167,7 +167,7 @@ void svdReconstruction(const raft::handle_t& handle, * @param stream cuda stream */ template -bool evaluateSVDByL2Norm(const raft::handle_t& handle, +bool evaluateSVDByL2Norm(raft::device_resources const& handle, math_t* A_d, math_t* U, math_t* S_vec, @@ -195,7 +195,7 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle, * U_in * @tparam VType std::optional> @c * V_in - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] sing_vals singular values raft::device_vector_view of shape (K) * @param[out] U_in std::optional left singular values of raft::device_matrix_view with layout @@ -204,7 +204,7 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle, * layout raft::col_major and dimensions (n, n) */ template -void svd_qr(const raft::handle_t& handle, +void svd_qr(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_vector_view sing_vals, UType&& U_in, @@ -258,7 +258,7 @@ void svd_qr(Args... args) * U_in * @tparam VType std::optional> @c * V_in - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] sing_vals singular values raft::device_vector_view of shape (K) * @param[out] U_in std::optional left singular values of raft::device_matrix_view with layout @@ -268,7 +268,7 @@ void svd_qr(Args... args) */ template void svd_qr_transpose_right_vec( - const raft::handle_t& handle, + raft::device_resources const& handle, raft::device_matrix_view in, raft::device_vector_view sing_vals, UType&& U_in, @@ -316,7 +316,7 @@ void svd_qr_transpose_right_vec(Args... args) /** * @brief singular value decomposition (SVD) on a column major * matrix using Eigen decomposition. A square symmetric covariance matrix is constructed for the SVD - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in input raft::device_matrix_view with layout raft::col_major of shape (M, N) * @param[out] S singular values raft::device_vector_view of shape (K) * @param[out] V right singular values of raft::device_matrix_view with layout @@ -326,7 +326,7 @@ void svd_qr_transpose_right_vec(Args... 
args) */ template void svd_eig( - const raft::handle_t& handle, + raft::device_resources const& handle, raft::device_matrix_view in, raft::device_vector_view S, raft::device_matrix_view V, @@ -352,7 +352,7 @@ void svd_eig( /** * @brief reconstruct a matrix using left and right singular vectors and * singular values - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] U left singular values of raft::device_matrix_view with layout * raft::col_major and dimensions (m, k) * @param[in] S singular values raft::device_vector_view of shape (k, k) @@ -361,7 +361,7 @@ void svd_eig( * @param[out] out output raft::device_matrix_view with layout raft::col_major of shape (m, n) */ template -void svd_reconstruction(const raft::handle_t& handle, +void svd_reconstruction(raft::device_resources const& handle, raft::device_matrix_view U, raft::device_vector_view S, raft::device_matrix_view V, diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh index 10e91a0313..aa3859bc23 100644 --- a/cpp/include/raft/linalg/ternary_op.cuh +++ b/cpp/include/raft/linalg/ternary_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ #include "detail/ternary_op.cuh" #include -#include +#include #include namespace raft { @@ -63,7 +63,7 @@ void ternaryOp(out_t* out, * @tparam InType Input Type raft::device_mdspan * @tparam Lambda the device-lambda performing the actual operation * @tparam OutType Output Type raft::device_mdspan - * @param[in] handle raft::handle_t + * @param[in] handle raft::device_resources * @param[in] in1 First input * @param[in] in2 Second input * @param[in] in3 Third input @@ -78,7 +78,7 @@ template , typename = raft::enable_if_output_device_mdspan> void ternary_op( - const raft::handle_t& handle, InType in1, InType in2, InType in3, OutType out, Lambda op) + raft::device_resources const& handle, InType in1, InType in2, InType in3, OutType out, Lambda op) { RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous"); diff --git a/cpp/include/raft/linalg/transpose.cuh b/cpp/include/raft/linalg/transpose.cuh index 608a87b489..a0f418b4f7 100644 --- a/cpp/include/raft/linalg/transpose.cuh +++ b/cpp/include/raft/linalg/transpose.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ namespace linalg { * @param stream: cuda stream */ template -void transpose(const raft::handle_t& handle, +void transpose(raft::device_resources const& handle, math_t* in, math_t* out, int n_rows, @@ -76,7 +76,7 @@ void transpose(math_t* inout, int n, cudaStream_t stream) * @param[out] out Output matrix, storage is pre-allocated by caller.
*/ template -auto transpose(handle_t const& handle, +auto transpose(raft::device_resources const& handle, raft::mdspan, LayoutPolicy, AccessorPolicy> in, raft::mdspan, LayoutPolicy, AccessorPolicy> out) -> std::enable_if_t, void> diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index a90bda06d5..ce102adfd2 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include "detail/unary_op.cuh" #include -#include +#include #include namespace raft { @@ -30,17 +30,16 @@ namespace linalg { /** * @brief perform element-wise unary operation in the input array * @tparam InType input data-type - * @tparam Lambda the device-lambda performing the actual operation + * @tparam Lambda Device lambda performing the actual operation, with the signature + * `OutType func(const InType& val);` * @tparam OutType output data-type * @tparam IdxType Integer type used to for addressing * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in the input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work - * @note Lambda must be a functor with the following signature: - * `OutType func(const InType& val);` + * @param[out] out Output array [on device], dim = [len] + * @param[in] in Input array [on device], dim = [len] + * @param[in] len Number of elements in the input array + * @param[in] op Device lambda + * @param[in] stream cuda stream where to launch work */ template @@ -81,23 +80,22 @@ void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) */ /** - * @brief perform element-wise binary operation on the input arrays + * @brief Perform an element-wise unary operation into the output array * @tparam InType Input Type raft::device_mdspan - * @tparam Lambda the device-lambda performing the actual operation + * @tparam Lambda Device lambda performing the actual operation, with the signature + * `out_value_t func(const in_value_t& val);` * @tparam OutType Output Type raft::device_mdspan - * @param[in] handle raft::handle_t - * @param[in] in Input - * @param[out] out Output - * @param[in] op the device-lambda - * @note Lambda must be a functor with the following signature: - * `InType func(const InType& val);` + * @param[in] handle The raft handle + * @param[in] in Input + * @param[out] out Output + * @param[in] op Device lambda */ template , typename = raft::enable_if_output_device_mdspan> -void unary_op(const raft::handle_t& handle, InType in, OutType out, Lambda op) +void unary_op(raft::device_resources const& handle, InType in, OutType out, Lambda op) { RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); @@ -116,29 +114,32 @@ void unary_op(const raft::handle_t& handle, InType in, OutType out, Lambda op) } /** - * @brief perform element-wise binary operation on the input arrays - * This function does not read from the input - * @tparam InType Input Type raft::device_mdspan - * @tparam Lambda the device-lambda performing the actual operation - * @param[in] handle raft::handle_t - * @param[inout] in Input/Output - * 
@param[in] op the device-lambda - * @note Lambda must be a functor with the following signature: - * `InType func(const InType& val);` + * @brief Perform an element-wise unary operation on the input index into the output array + * + * @note This operation is deprecated. Please use map_offset in `raft/linalg/map.cuh` instead. + * + * @tparam OutType Output Type raft::device_mdspan + * @tparam Lambda Device lambda performing the actual operation, with the signature + * `void func(out_value_t* out_location, index_t idx);` + * @param[in] handle The raft handle + * @param[out] out Output + * @param[in] op Device lambda */ -template > -void write_only_unary_op(const raft::handle_t& handle, InType in, Lambda op) +template > +void write_only_unary_op(const raft::device_resources& handle, OutType out, Lambda op) { - RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); - using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; - if (in.size() <= std::numeric_limits::max()) { - writeOnlyUnaryOp( - in.data_handle(), in.size(), op, handle.get_stream()); + if (out.size() <= std::numeric_limits::max()) { + writeOnlyUnaryOp( + out.data_handle(), out.size(), op, handle.get_stream()); } else { - writeOnlyUnaryOp( - in.data_handle(), in.size(), op, handle.get_stream()); + writeOnlyUnaryOp( + out.data_handle(), out.size(), op, handle.get_stream()); } } @@ -147,4 +148,4 @@ void write_only_unary_op(const raft::handle_t& handle, InType in, Lambda op) }; // end namespace linalg }; // end namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/matrix/argmax.cuh b/cpp/include/raft/matrix/argmax.cuh index a614f7043f..433c161079 100644 --- a/cpp/include/raft/matrix/argmax.cuh +++ b/cpp/include/raft/matrix/argmax.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ namespace raft::matrix { * @param[out] out: output vector of size n_rows */ template -void argmax(const raft::handle_t& handle, +void argmax(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_vector_view out) { diff --git a/cpp/include/raft/matrix/argmin.cuh b/cpp/include/raft/matrix/argmin.cuh index ca7b0252d2..31ef0c1c1b 100644 --- a/cpp/include/raft/matrix/argmin.cuh +++ b/cpp/include/raft/matrix/argmin.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ namespace raft::matrix { * @param[out] out: output vector of size n_rows */ template -void argmin(const raft::handle_t& handle, +void argmin(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_vector_view out) { diff --git a/cpp/include/raft/matrix/col_wise_sort.cuh b/cpp/include/raft/matrix/col_wise_sort.cuh index 662f62d865..a4daf097e5 100644 --- a/cpp/include/raft/matrix/col_wise_sort.cuh +++ b/cpp/include/raft/matrix/col_wise_sort.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
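A sketch of the row-wise `argmax` shown above after the handle rename (sizes are illustrative; `argmin` is called the same way):

```cpp
#include <raft/core/device_resources.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/matrix/argmax.cuh>

void rowwise_argmax_example(raft::device_resources const& handle)
{
  auto in  = raft::make_device_matrix<float, int, raft::row_major>(handle, 32, 10);
  auto out = raft::make_device_vector<int, int>(handle, 32);
  // out[i] = index of the maximum value in row i of `in`.
  raft::matrix::argmax(handle, in.view(), out.view());
}
```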
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,7 +71,7 @@ void sort_cols_per_row(const InType* in, * @param[out] sorted_keys_opt: std::optional, output matrix for sorted keys (input) */ template -void sort_cols_per_row(const raft::handle_t& handle, +void sort_cols_per_row(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out, sorted_keys_t&& sorted_keys_opt) diff --git a/cpp/include/raft/matrix/copy.cuh b/cpp/include/raft/matrix/copy.cuh index 0727fac246..42d2562e5e 100644 --- a/cpp/include/raft/matrix/copy.cuh +++ b/cpp/include/raft/matrix/copy.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,7 +39,7 @@ namespace raft::matrix { * @param[in] indices of the rows to be copied */ template -void copy_rows(const raft::handle_t& handle, +void copy_rows(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out, raft::device_vector_view indices) @@ -65,7 +65,7 @@ void copy_rows(const raft::handle_t& handle, * @param[out] out: output matrix */ template -void copy(const raft::handle_t& handle, +void copy(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out) { @@ -84,7 +84,7 @@ void copy(const raft::handle_t& handle, * @param out: output matrix */ template -void trunc_zero_origin(const raft::handle_t& handle, +void trunc_zero_origin(raft::device_resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out) { diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh index c006f69e47..f6dc60bf85 100644 --- a/cpp/include/raft/matrix/detail/gather.cuh +++ b/cpp/include/raft/matrix/detail/gather.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
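For the `copy_rows` overload above, a minimal sketch of gathering a subset of rows through an index vector (row counts and index values are assumptions):

```cpp
#include <raft/core/device_resources.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/matrix/copy.cuh>

void select_rows_example(raft::device_resources const& handle)
{
  auto in      = raft::make_device_matrix<float, int, raft::row_major>(handle, 100, 8);
  auto out     = raft::make_device_matrix<float, int, raft::row_major>(handle, 10, 8);
  auto indices = raft::make_device_vector<int, int>(handle, 10);  // fill with row ids
  // out[i] = in[indices[i]] for each of the 10 requested rows.
  raft::matrix::copy_rows(handle, in.view(), out.view(), indices.view());
}
```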
@@ -17,41 +17,63 @@ #pragma once #include +#include namespace raft { namespace matrix { namespace detail { -// gatherKernel conditionally copies rows from the source matrix 'in' into the destination matrix -// 'out' according to a map (or a transformed map) -template +struct gather_policy { + static constexpr int n_threads = tpb; + static constexpr int work_per_thread = wpt; + static constexpr int stride = tpb * wpt; +}; + +/** Conditionally copies rows from the source matrix 'in' into the destination matrix + * 'out' according to a map (or a transformed map) */ +template -__global__ void gatherKernel(const MatrixIteratorT in, - IndexT D, - IndexT N, - MapIteratorT map, - StencilIteratorT stencil, - MatrixIteratorT out, - PredicateOp pred_op, - MapTransformOp transform_op) + typename OutputIteratorT, + typename IndexT> +__global__ void gather_kernel(const InputIteratorT in, + IndexT D, + IndexT len, + const MapIteratorT map, + StencilIteratorT stencil, + OutputIteratorT out, + PredicateOp pred_op, + MapTransformOp transform_op) { typedef typename std::iterator_traits::value_type MapValueT; typedef typename std::iterator_traits::value_type StencilValueT; - IndexT outRowStart = blockIdx.x * D; - MapValueT map_val = map[blockIdx.x]; - StencilValueT stencil_val = stencil[blockIdx.x]; +#pragma unroll + for (IndexT wid = 0; wid < Policy::work_per_thread; wid++) { + IndexT tid = threadIdx.x + (Policy::work_per_thread * static_cast(blockIdx.x) + wid) * + Policy::n_threads; + if (tid < len) { + IndexT i_dst = tid / D; + IndexT j = tid % D; + + MapValueT map_val = map[i_dst]; + StencilValueT stencil_val = stencil[i_dst]; - bool predicate = pred_op(stencil_val); - if (predicate) { - IndexT inRowStart = transform_op(map_val) * D; - for (int i = threadIdx.x; i < D; i += TPB) { - out[outRowStart + i] = in[inRowStart + i]; + bool predicate = pred_op(stencil_val); + if (predicate) { + IndexT i_src = transform_op(map_val); + out[tid] = in[i_src * D + j]; + } } } } @@ -60,7 +82,7 @@ __global__ void gatherKernel(const MatrixIteratorT in, * @brief gather conditionally copies rows from a source matrix into a destination matrix according * to a transformed map. * - * @tparam MatrixIteratorT Random-access iterator type, for reading input matrix (may be a + * @tparam InputIteratorT Random-access iterator type, for reading input matrix (may be a * simple pointer type). * @tparam MapIteratorT Random-access iterator type, for reading input map (may be a simple * pointer type). @@ -69,7 +91,10 @@ __global__ void gatherKernel(const MatrixIteratorT in, * @tparam UnaryPredicateOp Unary lambda expression or operator type, UnaryPredicateOp's result * type must be convertible to bool type. * @tparam MapTransformOp Unary lambda expression or operator type, MapTransformOp's result - * type must be convertible to IndexT (= int) type. + * type must be convertible to IndexT. + * @tparam OutputIteratorT Random-access iterator type, for writing output matrix (may be a + * simple pointer type). + * @tparam IndexT Index type. 
* * @param in Pointer to the input matrix (assumed to be row-major) * @param D Leading dimension of the input matrix 'in', which in-case of row-major @@ -83,18 +108,20 @@ __global__ void gatherKernel(const MatrixIteratorT in, * @param transform_op The transformation operation, transforms the map values to IndexT * @param stream CUDA stream to launch kernels within */ -template -void gatherImpl(const MatrixIteratorT in, - int D, - int N, - MapIteratorT map, + typename MapTransformOp, + typename OutputIteratorT, + typename IndexT> +void gatherImpl(const InputIteratorT in, + IndexT D, + IndexT N, + const MapIteratorT map, StencilIteratorT stencil, - int map_length, - MatrixIteratorT out, + IndexT map_length, + OutputIteratorT out, UnaryPredicateOp pred_op, MapTransformOp transform_op, cudaStream_t stream) @@ -102,9 +129,6 @@ void gatherImpl(const MatrixIteratorT in, // skip in case of 0 length input if (map_length <= 0 || N <= 0 || D <= 0) return; - // signed integer type for indexing or global offsets - typedef int IndexT; - // map value type typedef typename std::iterator_traits::value_type MapValueT; @@ -121,38 +145,26 @@ void gatherImpl(const MatrixIteratorT in, static_assert((std::is_convertible::value), "UnaryPredicateOp's result type must be convertible to bool type"); - if (D <= 32) { - gatherKernel - <<>>(in, D, N, map, stencil, out, pred_op, transform_op); - } else if (D <= 64) { - gatherKernel - <<>>(in, D, N, map, stencil, out, pred_op, transform_op); - } else if (D <= 128) { - gatherKernel - <<>>(in, D, N, map, stencil, out, pred_op, transform_op); + IndexT len = map_length * D; + constexpr int TPB = 128; + const int n_sm = raft::getMultiProcessorCount(); + // The following empirical heuristics enforce that we keep a good balance between having enough + // blocks and enough work per thread. + if (len < static_cast(32 * TPB * n_sm)) { + using Policy = gather_policy; + IndexT n_blocks = raft::ceildiv(map_length * D, static_cast(Policy::stride)); + gather_kernel<<>>( + in, D, len, map, stencil, out, pred_op, transform_op); + } else if (len < static_cast(32 * 4 * TPB * n_sm)) { + using Policy = gather_policy; + IndexT n_blocks = raft::ceildiv(map_length * D, static_cast(Policy::stride)); + gather_kernel<<>>( + in, D, len, map, stencil, out, pred_op, transform_op); } else { - gatherKernel - <<>>(in, D, N, map, stencil, out, pred_op, transform_op); + using Policy = gather_policy; + IndexT n_blocks = raft::ceildiv(map_length * D, static_cast(Policy::stride)); + gather_kernel<<>>( + in, D, len, map, stencil, out, pred_op, transform_op); } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -160,10 +172,13 @@ void gatherImpl(const MatrixIteratorT in, /** * @brief gather copies rows from a source matrix into a destination matrix according to a map. * - * @tparam MatrixIteratorT Random-access iterator type, for reading input matrix (may be a + * @tparam InputIteratorT Random-access iterator type, for reading input matrix (may be a * simple pointer type). * @tparam MapIteratorT Random-access iterator type, for reading input map (may be a simple * pointer type). + * @tparam OutputIteratorT Random-access iterator type, for writing output matrix (may be a + * simple pointer type). + * @tparam IndexT Index type. 
* * @param in Pointer to the input matrix (assumed to be row-major) * @param D Leading dimension of the input matrix 'in', which in-case of row-major @@ -174,13 +189,13 @@ void gatherImpl(const MatrixIteratorT in, * @param out Pointer to the output matrix (assumed to be row-major) * @param stream CUDA stream to launch kernels within */ -template -void gather(const MatrixIteratorT in, - int D, - int N, - MapIteratorT map, - int map_length, - MatrixIteratorT out, +template +void gather(const InputIteratorT in, + IndexT D, + IndexT N, + const MapIteratorT map, + IndexT map_length, + OutputIteratorT out, cudaStream_t stream) { typedef typename std::iterator_traits::value_type MapValueT; @@ -192,12 +207,15 @@ void gather(const MatrixIteratorT in, * @brief gather copies rows from a source matrix into a destination matrix according to a * transformed map. * - * @tparam MatrixIteratorT Random-access iterator type, for reading input matrix (may be a + * @tparam InputIteratorT Random-access iterator type, for reading input matrix (may be a * simple pointer type). * @tparam MapIteratorT Random-access iterator type, for reading input map (may be a simple * pointer type). * @tparam MapTransformOp Unary lambda expression or operator type, MapTransformOp's result - * type must be convertible to IndexT (= int) type. + * type must be convertible to IndexT. + * @tparam OutputIteratorT Random-access iterator type, for writing output matrix (may be a + * simple pointer type). + * @tparam IndexT Index type. * * @param in Pointer to the input matrix (assumed to be row-major) * @param D Leading dimension of the input matrix 'in', which in-case of row-major @@ -209,13 +227,17 @@ void gather(const MatrixIteratorT in, * @param transform_op The transformation operation, transforms the map values to IndexT * @param stream CUDA stream to launch kernels within */ -template -void gather(const MatrixIteratorT in, - int D, - int N, - MapIteratorT map, - int map_length, - MatrixIteratorT out, +template +void gather(const InputIteratorT in, + IndexT D, + IndexT N, + const MapIteratorT map, + IndexT map_length, + OutputIteratorT out, MapTransformOp transform_op, cudaStream_t stream) { @@ -227,7 +249,7 @@ void gather(const MatrixIteratorT in, * @brief gather_if conditionally copies rows from a source matrix into a destination matrix * according to a map. * - * @tparam MatrixIteratorT Random-access iterator type, for reading input matrix (may be a + * @tparam InputIteratorT Random-access iterator type, for reading input matrix (may be a * simple pointer type). * @tparam MapIteratorT Random-access iterator type, for reading input map (may be a simple * pointer type). @@ -235,6 +257,9 @@ void gather(const MatrixIteratorT in, * simple pointer type). * @tparam UnaryPredicateOp Unary lambda expression or operator type, UnaryPredicateOp's result * type must be convertible to bool type. + * @tparam OutputIteratorT Random-access iterator type, for writing output matrix (may be a + * simple pointer type). + * @tparam IndexT Index type. 
* * @param in Pointer to the input matrix (assumed to be row-major) * @param D Leading dimension of the input matrix 'in', which in-case of row-major @@ -247,17 +272,19 @@ void gather(const MatrixIteratorT in, * @param pred_op Predicate to apply to the stencil values * @param stream CUDA stream to launch kernels within */ -template -void gather_if(const MatrixIteratorT in, - int D, - int N, - MapIteratorT map, + typename UnaryPredicateOp, + typename OutputIteratorT, + typename IndexT> +void gather_if(const InputIteratorT in, + IndexT D, + IndexT N, + const MapIteratorT map, StencilIteratorT stencil, - int map_length, - MatrixIteratorT out, + IndexT map_length, + OutputIteratorT out, UnaryPredicateOp pred_op, cudaStream_t stream) { @@ -269,7 +296,7 @@ void gather_if(const MatrixIteratorT in, * @brief gather_if conditionally copies rows from a source matrix into a destination matrix * according to a transformed map. * - * @tparam MatrixIteratorT Random-access iterator type, for reading input matrix (may be a + * @tparam InputIteratorT Random-access iterator type, for reading input matrix (may be a * simple pointer type). * @tparam MapIteratorT Random-access iterator type, for reading input map (may be a simple * pointer type). @@ -278,7 +305,10 @@ void gather_if(const MatrixIteratorT in, * @tparam UnaryPredicateOp Unary lambda expression or operator type, UnaryPredicateOp's result * type must be convertible to bool type. * @tparam MapTransformOp Unary lambda expression or operator type, MapTransformOp's result - * type must be convertible to IndexT (= int) type. + * type must be convertible to IndexT type. + * @tparam OutputIteratorT Random-access iterator type, for writing output matrix (may be a + * simple pointer type). + * @tparam IndexT Index type. * * @param in Pointer to the input matrix (assumed to be row-major) * @param D Leading dimension of the input matrix 'in', which in-case of row-major @@ -292,18 +322,20 @@ void gather_if(const MatrixIteratorT in, * @param transform_op The transformation operation, transforms the map values to IndexT * @param stream CUDA stream to launch kernels within */ -template -void gather_if(const MatrixIteratorT in, - int D, - int N, - MapIteratorT map, + typename MapTransformOp, + typename OutputIteratorT, + typename IndexT> +void gather_if(const InputIteratorT in, + IndexT D, + IndexT N, + const MapIteratorT map, StencilIteratorT stencil, - int map_length, - MatrixIteratorT out, + IndexT map_length, + OutputIteratorT out, UnaryPredicateOp pred_op, MapTransformOp transform_op, cudaStream_t stream) diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 605726bea6..ef8f0e88c1 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
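To make the refactored gather API above concrete, a call sketch of the pointer-based `detail::gather` with the new deduced `IndexT` (buffer contents and sizes are illustrative; the buffers would be filled before use):

```cpp
#include <raft/core/device_resources.hpp>
#include <raft/matrix/detail/gather.cuh>
#include <rmm/device_uvector.hpp>

void gather_rows_example(raft::device_resources const& handle)
{
  auto stream = handle.get_stream();
  int64_t n_rows = 1000, dim = 64, map_length = 128;
  rmm::device_uvector<float> in(n_rows * dim, stream);    // row-major [n_rows, dim]
  rmm::device_uvector<int64_t> map(map_length, stream);   // row indices to copy
  rmm::device_uvector<float> out(map_length * dim, stream);
  // out[i, :] = in[map[i], :] for each i < map_length; IndexT deduced as int64_t.
  raft::matrix::detail::gather(
    in.data(), dim, n_rows, map.data(), map_length, out.data(), stream);
}
```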
@@ -796,7 +796,8 @@ struct MatrixLinewiseOp { "layout for in and out must be either padded row or col major"); // also statically assert padded matrix alignment == 2^i*VecBytes - assert(raft::Pow2::areSameAlignOffsets(in, out)); + RAFT_EXPECTS(raft::Pow2::areSameAlignOffsets(in.data_handle(), out.data_handle()), + "The matrix views in and out do not have correct alignment"); if (alongLines) return matrixLinewiseVecRowsSpan +#include #include #include @@ -87,10 +87,10 @@ void seqRoot(math_t* in, if (a < math_t(0)) { return math_t(0); } else { - return sqrt(a * scalar); + return raft::sqrt(a * scalar); } } else { - return sqrt(a * scalar); + return raft::sqrt(a * scalar); } }, stream); @@ -194,7 +194,7 @@ void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_ template void ratio( - const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream) + raft::device_resources const& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream) { auto d_src = src; auto d_dest = dest; @@ -278,7 +278,7 @@ void matrixVectorBinaryDivSkipZero(Type* data, rowMajor, bcastAlongRows, [] __device__(Type a, Type b) { - if (raft::myAbs(b) < Type(1e-10)) + if (raft::abs(b) < Type(1e-10)) return Type(0); else return a / b; @@ -294,7 +294,7 @@ void matrixVectorBinaryDivSkipZero(Type* data, rowMajor, bcastAlongRows, [] __device__(Type a, Type b) { - if (raft::myAbs(b) < Type(1e-10)) + if (raft::abs(b) < Type(1e-10)) return a; else return a / b; diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index 17a40be5d6..ef3a873d90 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include @@ -299,7 +299,7 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) } template -m_t getL2Norm(const raft::handle_t& handle, const m_t* in, idx_t size, cudaStream_t stream) +m_t getL2Norm(raft::device_resources const& handle, const m_t* in, idx_t size, cudaStream_t stream) { cublasHandle_t cublasH = handle.get_cublas_handle(); m_t normval = 0; diff --git a/cpp/include/raft/matrix/detail/print.hpp b/cpp/include/raft/matrix/detail/print.hpp index fc3d14861c..814c6a0b4b 100644 --- a/cpp/include/raft/matrix/detail/print.hpp +++ b/cpp/include/raft/matrix/detail/print.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/topk.cuh b/cpp/include/raft/matrix/detail/select_k.cuh similarity index 59% rename from cpp/include/raft/spatial/knn/detail/topk.cuh rename to cpp/include/raft/matrix/detail/select_k.cuh index f4dcb53088..ac1ba3dfa3 100644 --- a/cpp/include/raft/spatial/knn/detail/topk.cuh +++ b/cpp/include/raft/matrix/detail/select_k.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION.
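The `math.cuh` hunks above swap the legacy `raft::myAbs` and unqualified `sqrt` calls for the `raft::abs`/`raft::sqrt` wrappers. A self-contained sketch of using those wrappers from device code (the kernel itself is illustrative, not from the patch):

```cpp
#include <raft/core/math.hpp>

// The raft::abs / raft::sqrt wrappers dispatch to the appropriate
// host or device overload, so the same spelling works in both contexts.
template <typename T>
__global__ void abs_sqrt_kernel(T* out, const T* in, int len)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) { out[i] = raft::sqrt(raft::abs(in[i])); }
}
```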
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,34 +16,34 @@ #pragma once -#include "topk/radix_topk.cuh" -#include "topk/warpsort_topk.cuh" +#include "select_radix.cuh" +#include "select_warpsort.cuh" #include #include #include -namespace raft::spatial::knn::detail { +namespace raft::matrix::detail { /** * Select k smallest or largest key/values from each row in the input data. * - * If you think of the input data `in_keys` as a row-major matrix with len columns and - * batch_size rows, then this function selects k smallest/largest values in each row and fills - * in the row-major matrix `out` of size (batch_size, k). + * If you think of the input data `in_val` as a row-major matrix with `len` columns and + * `batch_size` rows, then this function selects `k` smallest/largest values in each row and fills + * in the row-major matrix `out_val` of size (batch_size, k). * * @tparam T * the type of the keys (what is being compared). * @tparam IdxT * the index type (what is being selected together with the keys). * - * @param[in] in + * @param[in] in_val * contiguous device array of inputs of size (len * batch_size); * these are compared and selected. * @param[in] in_idx * contiguous device array of inputs of size (len * batch_size); - * typically, these are indices of the corresponding in_keys. + * typically, these are indices of the corresponding in_val. * @param batch_size * number of input rows, i.e. the batch size. * @param len @@ -51,12 +51,12 @@ namespace raft::spatial::knn::detail { * Invariant: len >= k. * @param k * the number of outputs to select in each input row. - * @param[out] out + * @param[out] out_val * contiguous device array of outputs of size (k * batch_size); - * the k smallest/largest values from each row of the `in_keys`. + * the k smallest/largest values from each row of the `in_val`. * @param[out] out_idx * contiguous device array of outputs of size (k * batch_size); - * the payload selected together with `out`. + * the payload selected together with `out_val`. * @param select_min * whether to select k smallest (true) or largest (false) keys. * @param stream @@ -64,28 +64,28 @@ namespace raft::spatial::knn::detail { * memory pool here to avoid memory allocations within the call). */ template -void select_topk(const T* in, - const IdxT* in_idx, - size_t batch_size, - size_t len, - int k, - T* out, - IdxT* out_idx, - bool select_min, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = nullptr) +void select_k(const T* in_val, + const IdxT* in_idx, + size_t batch_size, + size_t len, + int k, + T* out_val, + IdxT* out_idx, + bool select_min, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = nullptr) { common::nvtx::range fun_scope( - "matrix::select_topk(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k); + "matrix::select_k(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k); // TODO (achirkin): investigate the trade-off for a wider variety of inputs. const bool radix_faster = batch_size >= 64 && len >= 102400 && k >= 128; - if (k <= raft::spatial::knn::detail::topk::kMaxCapacity && !radix_faster) { - topk::warp_sort_topk( - in, in_idx, batch_size, len, k, out, out_idx, select_min, stream, mr); + if (k <= select::warpsort::kMaxCapacity && !radix_faster) { + select::warpsort::select_k( + in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); } else { - topk::radix_topk= 4 ? 
diff --git a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh
similarity index 87%
rename from cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
rename to cpp/include/raft/matrix/detail/select_radix.cuh
index 9c0f20b706..de19e63a4c 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh
+++ b/cpp/include/raft/matrix/detail/select_radix.cuh
@@ -17,6 +17,7 @@
 #pragma once
 #include
+#include
 #include
 #include
 #include
@@ -27,29 +28,29 @@
 #include
 #include
-#include
+#include
 #include
-namespace raft::spatial::knn::detail::topk {
+namespace raft::matrix::detail::select::radix {
 constexpr int ITEM_PER_THREAD      = 32;
 constexpr int VECTORIZED_READ_SIZE = 16;
 template
-__host__ __device__ constexpr int calc_num_buckets()
+_RAFT_HOST_DEVICE constexpr int calc_num_buckets()
 {
   return 1 << BitsPerPass;
 }
 template
-__host__ __device__ constexpr int calc_num_passes()
+_RAFT_HOST_DEVICE constexpr int calc_num_passes()
 {
   return ceildiv(sizeof(T) * 8, BitsPerPass);
 }
 // Minimum reasonable block size for the given radix size.
 template
-__host__ __device__ constexpr int calc_min_block_size()
+_RAFT_HOST_DEVICE constexpr int calc_min_block_size()
 {
   return 1 << std::max(BitsPerPass - 4, Pow2::Log2 + 1);
 }
@@ -62,7 +63,7 @@ __host__ __device__ constexpr int calc_min_block_size()
 * NB: Use pass=-1 for calc_mask().
 */
 template
-__device__ constexpr int calc_start_bit(int pass)
+_RAFT_DEVICE constexpr int calc_start_bit(int pass)
 {
   int start_bit = static_cast(sizeof(T) * 8) - (pass + 1) * BitsPerPass;
   if (start_bit < 0) { start_bit = 0; }
@@ -70,7 +71,7 @@ __device__ constexpr int calc_start_bit(int pass)
 }
 template
-__device__ constexpr unsigned calc_mask(int pass)
+_RAFT_DEVICE constexpr unsigned calc_mask(int pass)
 {
   static_assert(BitsPerPass <= 31);
   int num_bits = calc_start_bit(pass - 1) - calc_start_bit(pass);
@@ -82,7 +83,7 @@ __device__ constexpr unsigned calc_mask(int pass)
 * as of integers.
 */
 template
-__device__ typename cub::Traits::UnsignedBits twiddle_in(T key, bool greater)
+_RAFT_DEVICE typename cub::Traits::UnsignedBits twiddle_in(T key, bool greater)
 {
   auto bits = reinterpret_cast::UnsignedBits&>(key);
   bits      = cub::Traits::TwiddleIn(bits);
@@ -91,7 +92,7 @@ __device__ typename cub::Traits::UnsignedBits twiddle_in(T key, bool greater)
 }
 template
-__device__ int calc_bucket(T x, int start_bit, unsigned mask, bool greater)
+_RAFT_DEVICE int calc_bucket(T x, int start_bit, unsigned mask, bool greater)
 {
   static_assert(BitsPerPass <= sizeof(int) * 8 - 1);  // so return type can be int
   return (twiddle_in(x, greater) >> start_bit) & mask;
@@ -112,7 +113,7 @@ __device__ int calc_bucket(T x, int start_bit, unsigned mask, bool greater)
 * @param f the lambda taking two arguments (T x, IdxT idx)
 */
 template
-__device__ void vectorized_process(const T* in, IdxT len, Func f)
+_RAFT_DEVICE void vectorized_process(const T* in, IdxT len, Func f)
 {
   const IdxT stride = blockDim.x * gridDim.x;
   const int tid     = blockIdx.x * blockDim.x + threadIdx.x;
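The radix geometry follows directly from the two constexpr helpers above: 2^BitsPerPass buckets per pass, and ceil(8 * sizeof(T) / BitsPerPass) passes over the key bits. A compile-time spot check (illustrative; assumes the helpers are visible with their usual template parameters):

// 11 bits per pass => 2048 buckets; a 32-bit key needs ceil(32/11) = 3 passes,
// while a 64-bit key at 8 bits per pass needs ceil(64/8) = 8 passes.
static_assert(calc_num_buckets<11>() == 2048);
static_assert(calc_num_passes<float, 11>() == 3);
static_assert(calc_num_passes<double, 8>() == 8);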
@@ -167,18 +168,18 @@ struct Counter {
 * (see steps 4-1 in `radix_kernel` description).
 */
 template
-__device__ void filter_and_histogram(const T* in_buf,
-                                     const IdxT* in_idx_buf,
-                                     T* out_buf,
-                                     IdxT* out_idx_buf,
-                                     T* out,
-                                     IdxT* out_idx,
-                                     IdxT len,
-                                     Counter* counter,
-                                     IdxT* histogram,
-                                     bool greater,
-                                     int pass,
-                                     int k)
+_RAFT_DEVICE void filter_and_histogram(const T* in_buf,
+                                       const IdxT* in_idx_buf,
+                                       T* out_buf,
+                                       IdxT* out_idx_buf,
+                                       T* out,
+                                       IdxT* out_idx,
+                                       IdxT len,
+                                       Counter* counter,
+                                       IdxT* histogram,
+                                       bool greater,
+                                       int pass,
+                                       int k)
 {
   constexpr int num_buckets = calc_num_buckets();
   __shared__ IdxT histogram_smem[num_buckets];
@@ -260,10 +261,10 @@ __device__ void filter_and_histogram(const T* in_buf,
 * (step 2 in `radix_kernel` description)
 */
 template
-__device__ void scan(volatile IdxT* histogram,
-                     const int start,
-                     const int num_buckets,
-                     const IdxT current)
+_RAFT_DEVICE void scan(volatile IdxT* histogram,
+                       const int start,
+                       const int num_buckets,
+                       const IdxT current)
 {
   typedef cub::BlockScan BlockScan;
   __shared__ typename BlockScan::TempStorage temp_storage;
@@ -284,7 +285,7 @@ __device__ void scan(volatile IdxT* histogram,
 * (steps 2-3 in `radix_kernel` description)
 */
 template
-__device__ void choose_bucket(Counter* counter, IdxT* histogram, const IdxT k)
+_RAFT_DEVICE void choose_bucket(Counter* counter, IdxT* histogram, const IdxT k)
 {
   constexpr int num_buckets = calc_num_buckets();
   int index                 = threadIdx.x;
@@ -547,21 +548,21 @@ inline dim3 get_optimal_grid_size(size_t req_batch_size, size_t len)
 * memory pool here to avoid memory allocations within the call).
 */
 template
-void radix_topk(const T* in,
-                const IdxT* in_idx,
-                size_t batch_size,
-                size_t len,
-                int k,
-                T* out,
-                IdxT* out_idx,
-                bool select_min,
-                rmm::cuda_stream_view stream,
-                rmm::mr::device_memory_resource* mr = nullptr)
+void select_k(const T* in,
+              const IdxT* in_idx,
+              size_t batch_size,
+              size_t len,
+              int k,
+              T* out,
+              IdxT* out_idx,
+              bool select_min,
+              rmm::cuda_stream_view stream,
+              rmm::mr::device_memory_resource* mr = nullptr)
 {
   // reduce the block size if the input length is too small.
   if constexpr (BlockSize > calc_min_block_size()) {
     if (BlockSize * ITEM_PER_THREAD > len) {
-      return radix_topk(
+      return select_k(
         in, in_idx, batch_size, len, k, out, out_idx, select_min, stream);
     }
   }
@@ -573,23 +574,33 @@ void radix_topk(const T* in,
   dim3 blocks           = get_optimal_grid_size(batch_size, len);
   size_t max_chunk_size = blocks.y;
-  auto pool_guard = raft::get_pool_memory_resource(
-    mr,
-    max_chunk_size * (sizeof(Counter)                   // counters
-                      + sizeof(IdxT) * (num_buckets + 2)  // histograms and IdxT bufs
-                      + sizeof(T) * 2                     // T bufs
-                      ));
+  size_t req_aux = max_chunk_size * (sizeof(Counter) + num_buckets * sizeof(IdxT));
+  size_t req_buf = max_chunk_size * len * 2 * (sizeof(T) + sizeof(IdxT));
+  size_t mem_req = req_aux + req_buf;
+  size_t mem_free, mem_total;
+  RAFT_CUDA_TRY(cudaMemGetInfo(&mem_free, &mem_total));
+  std::optional managed_memory;
+  rmm::mr::device_memory_resource* mr_buf = nullptr;
+  if (mem_req > mem_free) {
+    // if there's not enough memory for buffers on the device, resort to the managed memory.
+    mem_req = req_aux;
+    managed_memory.emplace();
+    mr_buf = &managed_memory.value();
+  }
+
+  auto pool_guard = raft::get_pool_memory_resource(mr, mem_req);
   if (pool_guard) {
-    RAFT_LOG_DEBUG("radix_topk: using pool memory resource with initial size %zu bytes",
+    RAFT_LOG_DEBUG("radix::select_k: using pool memory resource with initial size %zu bytes",
                    pool_guard->pool_size());
   }
+  if (mr_buf == nullptr) { mr_buf = mr; }
   rmm::device_uvector> counters(max_chunk_size, stream, mr);
-  rmm::device_uvector histograms(num_buckets * max_chunk_size, stream, mr);
-  rmm::device_uvector buf1(len * max_chunk_size, stream, mr);
-  rmm::device_uvector idx_buf1(len * max_chunk_size, stream, mr);
-  rmm::device_uvector buf2(len * max_chunk_size, stream, mr);
-  rmm::device_uvector idx_buf2(len * max_chunk_size, stream, mr);
+  rmm::device_uvector histograms(max_chunk_size * num_buckets, stream, mr);
+  rmm::device_uvector buf1(max_chunk_size * len, stream, mr_buf);
+  rmm::device_uvector idx_buf1(max_chunk_size * len, stream, mr_buf);
+  rmm::device_uvector buf2(max_chunk_size * len, stream, mr_buf);
+  rmm::device_uvector idx_buf2(max_chunk_size * len, stream, mr_buf);
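The new allocation strategy above boils down to: keep the small auxiliary arrays (counters, histograms) in device memory, and push the large candidate buffers to CUDA managed memory only when they would not fit. A condensed sketch of that decision, with an illustrative helper name that is not part of the PR:

#include <optional>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

// Pick the memory resource for the large scratch buffers: plain device memory
// when the estimate fits into free memory, otherwise unified (managed) memory,
// trading bandwidth for the ability to oversubscribe.
inline rmm::mr::device_memory_resource* choose_buf_resource(
  size_t req_bytes, std::optional<rmm::mr::managed_memory_resource>& managed)
{
  size_t mem_free = 0, mem_total = 0;
  RAFT_CUDA_TRY(cudaMemGetInfo(&mem_free, &mem_total));
  if (req_bytes > mem_free) {
    managed.emplace();
    return &managed.value();
  }
  return rmm::mr::get_current_device_resource();
}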
   for (size_t offset = 0; offset < batch_size; offset += max_chunk_size) {
     blocks.y = std::min(max_chunk_size, batch_size - offset);
@@ -646,4 +657,4 @@ void radix_topk(const T* in,
   }
 }
-}  // namespace raft::spatial::knn::detail::topk
+}  // namespace raft::matrix::detail::select::radix
diff --git a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh
similarity index 71%
rename from cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
rename to cpp/include/raft/matrix/detail/select_warpsort.cuh
index cbe9f36e97..d362b73792 100644
--- a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
+++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,10 +16,11 @@
 #pragma once
-#include "bitonic_sort.cuh"
-
+#include
 #include
+#include
 #include
+#include
 #include
 #include
@@ -31,12 +32,12 @@
 /*
  Three APIs of different scopes are provided:
-   1. host function: warp_sort_topk()
+   1. host function: select_k()
    2. block-wide API: class block_sort
    3. warp-wide API: several implementations of warp_sort_*

-  1. warp_sort_topk()
+  1. select_k()
    (see the docstring)

   2. class block_sort
@@ -74,7 +75,7 @@
  These two classes can be regarded as fixed-size priority queues for a warp.
  Usage is similar to class block_sort. No shared memory is needed.

- The host function (warp_sort_topk) uses a heuristic to choose between these two classes for
+ The host function (select_k) uses a heuristic to choose between these two classes for
  sorting, warp_sort_immediate being chosen when the number of inputs per warp is somewhat small
  (see the usage of LaunchThreshold::len_factor_for_choosing).
@@ -94,7 +95,7 @@
 }
 */
-namespace raft::spatial::knn::detail::topk {
+namespace raft::matrix::detail::select::warpsort {
 static constexpr int kMaxCapacity = 256;
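For orientation, a hypothetical kernel using the block-wide API after this refactor. The template parameter order, Capacity = 256, and the kDummy padding of the tail are assumptions for illustration (the PR's real kernel is block_kernel, further below):

template <typename T, typename IdxT>
__global__ void block_select_k(const T* in, const IdxT* in_idx, IdxT len, int k,
                               T* out, IdxT* out_idx)
{
  extern __shared__ __align__(256) uint8_t smem_buf[];
  using namespace raft::matrix::detail::select::warpsort;
  using queue_base = warp_sort_immediate<256, true, T, IdxT>;
  // the constructor forwards to init_blockwide(); warp_sort_immediate needs no extra state
  block_sort<warp_sort_immediate, 256, true, T, IdxT> queue(k);
  // pad the tail with the sentinel so all lanes of a warp call add() together
  IdxT len_padded = ((len + blockDim.x - 1) / blockDim.x) * blockDim.x;
  for (IdxT i = threadIdx.x; i < len_padded; i += blockDim.x) {
    queue.add(i < len ? in[i] : queue_base::kDummy,
              (i < len && in_idx != nullptr) ? in_idx[i] : i);
  }
  queue.done(smem_buf);  // done() now takes the scratch for the cross-warp merge
  queue.store(out + blockIdx.x * k, out_idx + blockIdx.x * k);
}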
 namespace {
 /** Whether 'left` should indeed be on the left w.r.t. `right`. */
 template
-__device__ __forceinline__ auto is_ordered(T left, T right) -> bool
+_RAFT_DEVICE _RAFT_FORCEINLINE auto is_ordered(T left, T right) -> bool
 {
   if constexpr (Ascending) { return left < right; }
   if constexpr (!Ascending) { return left > right; }
 }
-constexpr auto calc_capacity(int k) -> int
-{
-  int capacity = isPo2(k) ? k : (1 << (log2(k) + 1));
-  return capacity;
-}
-
 }  // namespace
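The removed file-local calc_capacity() is superseded by the shared bound_by_power_of_two() helper (used later in launch_setup); both round k up to the next power of two and leave it unchanged when it already is one. A constexpr stand-in capturing that behaviour, for illustration only:

constexpr int next_pow2(int k)  // illustrative stand-in, not the RAFT helper
{
  int c = 1;
  while (c < k) { c <<= 1; }
  return c;
}
static_assert(next_pow2(1) == 1 && next_pow2(5) == 8 && next_pow2(256) == 256);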
 /**
 *
@@ -134,7 +129,7 @@ constexpr auto calc_capacity(int k) -> int
 */
 template
 class warp_sort {
-  static_assert(isPo2(Capacity));
+  static_assert(is_a_power_of_two(Capacity));
   static_assert(std::is_default_constructible_v);
 public:
   /** The number of elements to select. */
   const int k;
+  /** Extra memory required per-block for keeping the state (shared or global). */
+  constexpr static auto mem_required(uint32_t block_size) -> size_t { return 0; }
+
   /**
   * Construct the warp_sort empty queue.
   *
   * @param k
   * number of elements to select.
   */
-  __device__ warp_sort(int k) : k(k)
+  _RAFT_DEVICE warp_sort(int k) : k(k)
   {
 #pragma unroll
     for (int i = 0; i < kMaxArrLen; i++) {
@@ -182,7 +180,7 @@
   * It serves as a conditional; when `false` the function does nothing.
   * We need it to ensure threads within a full warp don't diverge calling `bitonic::merge()`.
   */
-  __device__ void load_sorted(const T* in, const IdxT* in_idx, bool do_merge = true)
+  _RAFT_DEVICE void load_sorted(const T* in, const IdxT* in_idx, bool do_merge = true)
   {
     if (do_merge) {
       int idx = Pow2::mod(laneId()) ^ Pow2::Mask;
@@ -198,7 +196,7 @@
       }
     }
     if (kWarpWidth < WarpSize || do_merge) {
-      topk::bitonic(Ascending, kWarpWidth).merge(val_arr_, idx_arr_);
+      util::bitonic(Ascending, kWarpWidth).merge(val_arr_, idx_arr_);
     }
   }
@@ -211,14 +209,23 @@
   * @param[out] out_idx
   * device pointer to a contiguous array, unique per-subwarp of size `kWarpWidth`
   * (length: k <= kWarpWidth * kMaxArrLen).
+  * @param valF (optional) postprocess values (T -> OutT)
+  * @param idxF (optional) postprocess indices (IdxT -> OutIdxT)
   */
-  __device__ void store(T* out, IdxT* out_idx) const
+  template
+  _RAFT_DEVICE void store(OutT* out,
+                          OutIdxT* out_idx,
+                          ValF valF = raft::identity_op{},
+                          IdxF idxF = raft::identity_op{}) const
   {
     int idx = Pow2::mod(laneId());
 #pragma unroll kMaxArrLen
     for (int i = 0; i < kMaxArrLen && idx < k; i++, idx += kWarpWidth) {
-      out[idx]     = val_arr_[i];
-      out_idx[idx] = idx_arr_[i];
+      out[idx]     = valF(val_arr_[i]);
+      out_idx[idx] = idxF(idx_arr_[i]);
     }
   }
@@ -245,8 +252,8 @@
   * the associated indices of the elements in the same format as `keys_in`.
   */
   template
-  __device__ __forceinline__ void merge_in(const T* __restrict__ keys_in,
-                                           const IdxT* __restrict__ ids_in)
+  _RAFT_DEVICE _RAFT_FORCEINLINE void merge_in(const T* __restrict__ keys_in,
+                                               const IdxT* __restrict__ ids_in)
   {
 #pragma unroll
     for (int i = std::min(kMaxArrLen, PerThreadSizeIn); i > 0; i--) {
@@ -257,7 +264,7 @@
       idx_arr_[kMaxArrLen - i] = ids_in[PerThreadSizeIn - i];
     }
   }
-    topk::bitonic(Ascending, kWarpWidth).merge(val_arr_, idx_arr_);
+    util::bitonic(Ascending, kWarpWidth).merge(val_arr_, idx_arr_);
   }
 };
@@ -275,8 +282,9 @@ class warp_sort_filtered : public warp_sort {
   using warp_sort::kDummy;
   using warp_sort::kWarpWidth;
   using warp_sort::k;
+  using warp_sort::mem_required;
-  __device__ warp_sort_filtered(int k, T limit)
+  explicit _RAFT_DEVICE warp_sort_filtered(int k, T limit = kDummy)
     : warp_sort(k), buf_len_(0), k_th_(limit)
   {
 #pragma unroll
@@ -286,12 +294,14 @@
     }
   }
-  __device__ __forceinline__ explicit warp_sort_filtered(int k)
-    : warp_sort_filtered(k, kDummy)
+  _RAFT_DEVICE _RAFT_FORCEINLINE static auto init_blockwide(int k,
+                                                            uint8_t* = nullptr,
+                                                            T limit = kDummy)
   {
+    return warp_sort_filtered{k, limit};
   }
-  __device__ void add(T val, IdxT idx)
+  _RAFT_DEVICE void add(T val, IdxT idx)
   {
     // comparing for k_th should reduce the total amount of updates:
     // `false` means the input value is surely not in the top-k values.
@@ -309,22 +319,22 @@
     if (do_add) { add_to_buf_(val, idx); }
   }
-  __device__ void done()
+  _RAFT_DEVICE void done()
   {
     if (any(buf_len_ != 0)) { merge_buf_(); }
   }
 private:
-  __device__ __forceinline__ void set_k_th_()
+  _RAFT_DEVICE _RAFT_FORCEINLINE void set_k_th_()
   {
     // NB on using srcLane: it's ok if it is outside the warp size / width;
     // the modulo op will be done inside the __shfl_sync.
     k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth);
   }
-  __device__ __forceinline__ void merge_buf_()
+  _RAFT_DEVICE _RAFT_FORCEINLINE void merge_buf_()
   {
-    topk::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_);
+    util::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_);
     this->merge_in(val_buf_, idx_buf_);
     buf_len_ = 0;
     set_k_th_();  // contains warp sync
@@ -334,7 +344,7 @@
     }
   }
-  __device__ __forceinline__ void add_to_buf_(T val, IdxT idx)
+  _RAFT_DEVICE _RAFT_FORCEINLINE void add_to_buf_(T val, IdxT idx)
   {
     // NB: the loop is used here to ensure the constant indexing,
     // to not force the buffers spill into the local memory.
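warp_sort_filtered's core trick is easy to see in a scalar setting: once k candidates are held, a single comparison against the current k-th value rejects most inputs before any sorting work happens. A host-side toy analogue (illustrative only; assumes in.size() >= k):

#include <queue>
#include <vector>

std::vector<float> top_k_min(const std::vector<float>& in, size_t k)
{
  // max-heap of the k smallest seen so far; top() plays the role of k_th_
  std::priority_queue<float> worst_on_top(in.begin(), in.begin() + k);
  for (size_t i = k; i < in.size(); ++i) {
    if (in[i] < worst_on_top.top()) {  // the cheap filter; most inputs stop here
      worst_on_top.pop();
      worst_on_top.push(in[i]);
    }
  }
  std::vector<float> out;
  for (; !worst_on_top.empty(); worst_on_top.pop()) { out.push_back(worst_on_top.top()); }
  return out;  // the k smallest values, largest first
}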
@@ -373,8 +383,9 @@ class warp_sort_distributed : public warp_sort {
   using warp_sort::kDummy;
   using warp_sort::kWarpWidth;
   using warp_sort::k;
+  using warp_sort::mem_required;
-  __device__ warp_sort_distributed(int k, T limit)
+  explicit _RAFT_DEVICE warp_sort_distributed(int k, T limit = kDummy)
     : warp_sort(k),
       buf_val_(kDummy),
       buf_idx_(IdxT{}),
@@ -383,12 +394,14 @@
   {
   }
-  __device__ __forceinline__ explicit warp_sort_distributed(int k)
-    : warp_sort_distributed(k, kDummy)
+  _RAFT_DEVICE _RAFT_FORCEINLINE static auto init_blockwide(int k,
+                                                            uint8_t* = nullptr,
+                                                            T limit = kDummy)
   {
+    return warp_sort_distributed{k, limit};
   }
-  __device__ void add(T val, IdxT idx)
+  _RAFT_DEVICE void add(T val, IdxT idx)
   {
     // mask tells which lanes in the warp have valid items to be added
     uint32_t mask = ballot(is_ordered(val, k_th_));
@@ -428,7 +441,7 @@
     }
   }
-  __device__ void done()
+  _RAFT_DEVICE void done()
   {
     if (buf_len_ != 0) {
       merge_buf_();
@@ -437,16 +450,16 @@
   }
 private:
-  __device__ __forceinline__ void set_k_th_()
+  _RAFT_DEVICE _RAFT_FORCEINLINE void set_k_th_()
   {
     // NB on using srcLane: it's ok if it is outside the warp size / width;
     // the modulo op will be done inside the __shfl_sync.
     k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth);
   }
-  __device__ __forceinline__ void merge_buf_()
+  _RAFT_DEVICE _RAFT_FORCEINLINE void merge_buf_()
   {
-    topk::bitonic<1>(!Ascending, kWarpWidth).sort(buf_val_, buf_idx_);
+    util::bitonic<1>(!Ascending, kWarpWidth).sort(buf_val_, buf_idx_);
     this->merge_in<1>(&buf_val_, &buf_idx_);
     set_k_th_();  // contains warp sync
     buf_val_ = kDummy;
@@ -463,6 +476,117 @@
   T k_th_;
 };
+/**
+ * The same as `warp_sort_distributed`, but keeps the temporary value and index buffers
+ * in the given external pointers (normally, a shared memory pointer should be passed in).
+ */
+template
+class warp_sort_distributed_ext : public warp_sort {
+ public:
+  using warp_sort::kDummy;
+  using warp_sort::kWarpWidth;
+  using warp_sort::k;
+
+  constexpr static auto mem_required(uint32_t block_size) -> size_t
+  {
+    return (sizeof(T) + sizeof(IdxT)) * block_size;
+  }
+
+  _RAFT_DEVICE warp_sort_distributed_ext(int k, T* val_buf, IdxT* idx_buf, T limit = kDummy)
+    : warp_sort(k),
+      val_buf_(val_buf),
+      idx_buf_(idx_buf),
+      buf_len_(0),
+      k_th_(limit)
+  {
+    val_buf_[laneId()] = kDummy;
+  }
+
+  _RAFT_DEVICE static auto init_blockwide(int k, uint8_t* shmem, T limit = kDummy)
+  {
+    T* val_buf    = nullptr;
+    IdxT* idx_buf = nullptr;
+    if constexpr (alignof(T) >= alignof(IdxT)) {
+      val_buf = reinterpret_cast(shmem);
+      idx_buf = reinterpret_cast(val_buf + blockDim.x);
+    } else {
+      idx_buf = reinterpret_cast(shmem);
+      val_buf = reinterpret_cast(idx_buf + blockDim.x);
+    }
+    auto warp_offset = Pow2::roundDown(threadIdx.x);
+    val_buf += warp_offset;
+    idx_buf += warp_offset;
+    return warp_sort_distributed_ext{k, val_buf, idx_buf, limit};
+  }
+
+  _RAFT_DEVICE void add(T val, IdxT idx)
+  {
+    bool do_add = is_ordered(val, k_th_);
+    // mask tells which lanes in the warp have valid items to be added
+    uint32_t mask = ballot(do_add);
+    if (mask == 0) { return; }
+    // where to put the element in the tmp buffer
+    int dst_ix = buf_len_ + __popc(mask & ((1u << laneId()) - 1u));
+    // put all elements, which fit into the current tmp buffer
+    if (do_add && dst_ix < WarpSize) {
+      val_buf_[dst_ix] = val;
+      idx_buf_[dst_ix] = idx;
+      do_add           = false;
+    }
+    // Total number of elements to be added
+    buf_len_ += __popc(mask);
+    // If the buffer is still not full, we can return
+    if (buf_len_ < WarpSize) { return; }
+    // Otherwise, merge the warp tmp buffer into the queue
+    merge_buf_();  // implies warp sync
+    buf_len_ -= WarpSize;
+    // save the inputs that couldn't fit before the merge
+    if (do_add) {
+      dst_ix -= WarpSize;
+      val_buf_[dst_ix] = val;
+      idx_buf_[dst_ix] = idx;
+    }
+  }
+
+  _RAFT_DEVICE void done()
+  {
+    if (buf_len_ != 0) {
+      merge_buf_();
+      buf_len_ = 0;
+    }
+    __syncthreads();
+  }
+
+ private:
+  _RAFT_DEVICE _RAFT_FORCEINLINE void set_k_th_()
+  {
+    // NB on using srcLane: it's ok if it is outside the warp size / width;
+    // the modulo op will be done inside the __shfl_sync.
+    k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth);
+  }
+
+  _RAFT_DEVICE _RAFT_FORCEINLINE void merge_buf_()
+  {
+    __syncwarp();  // make sure the threads are aware of the data written by others
+    T buf_val          = val_buf_[laneId()];
+    IdxT buf_idx       = idx_buf_[laneId()];
+    val_buf_[laneId()] = kDummy;
+    util::bitonic<1>(!Ascending, kWarpWidth).sort(buf_val, buf_idx);
+    this->merge_in<1>(&buf_val, &buf_idx);
+    set_k_th_();  // contains warp sync
+  }
+
+  using warp_sort::kMaxArrLen;
+  using warp_sort::val_arr_;
+  using warp_sort::idx_arr_;
+
+  T* val_buf_;
+  IdxT* idx_buf_;
+  uint32_t buf_len_;  // 0 <= buf_len_ < WarpSize
+
+  T k_th_;
+};
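The slot computation in add() above is the standard warp-compaction idiom. Written with raw CUDA intrinsics instead of raft's ballot()/laneId() wrappers, it reads:

// Each lane votes whether it has an item; a lane's destination slot is the
// number of voting lanes with a lower lane id, offset by the current fill level.
__device__ int compaction_slot(bool has_item, int fill_level)
{
  unsigned mask = __ballot_sync(0xffffffffu, has_item);
  unsigned lane = threadIdx.x & 31u;
  return fill_level + __popc(mask & ((1u << lane) - 1u));
}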
+/**
 * This version of warp_sort adds every input element into the intermediate sorting
 * buffer, and thus does the sorting step every `Capacity` input elements.
@@ -475,8 +599,10 @@ class warp_sort_immediate : public warp_sort {
   using warp_sort::kDummy;
   using warp_sort::kWarpWidth;
   using warp_sort::k;
+  using warp_sort::mem_required;
-  __device__ warp_sort_immediate(int k) : warp_sort(k), buf_len_(0)
+  explicit _RAFT_DEVICE warp_sort_immediate(int k)
+    : warp_sort(k), buf_len_(0)
   {
 #pragma unroll
     for (int i = 0; i < kMaxArrLen; i++) {
@@ -485,7 +611,12 @@
     }
   }
-  __device__ void add(T val, IdxT idx)
+  _RAFT_DEVICE _RAFT_FORCEINLINE static auto init_blockwide(int k, uint8_t* = nullptr)
+  {
+    return warp_sort_immediate{k};
+  }
+
+  _RAFT_DEVICE void add(T val, IdxT idx)
   {
     // NB: the loop is used here to ensure the constant indexing,
     // to not force the buffers spill into the local memory.
@@ -499,7 +630,7 @@
     ++buf_len_;
     if (buf_len_ == kMaxArrLen) {
-      topk::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_);
+      util::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_);
       this->merge_in(val_buf_, idx_buf_);
 #pragma unroll
       for (int i = 0; i < kMaxArrLen; i++) {
@@ -509,10 +640,10 @@
     }
   }
-  __device__ void done()
+  _RAFT_DEVICE void done()
   {
     if (buf_len_ != 0) {
-      topk::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_);
+      util::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_);
       this->merge_in(val_buf_, idx_buf_);
     }
   }
@@ -544,15 +675,11 @@ class block_sort {
   using queue_t = WarpSortWarpWide;
   template
-  __device__ block_sort(int k, uint8_t* smem_buf, Args... args) : queue_(k, args...)
+  _RAFT_DEVICE block_sort(int k, Args... args) : queue_(queue_t::init_blockwide(k, args...))
   {
-    val_smem_             = reinterpret_cast(smem_buf);
-    const int num_of_warp = subwarp_align::div(blockDim.x);
-    idx_smem_             = reinterpret_cast(
-      smem_buf + Pow2<256>::roundUp(ceildiv(num_of_warp, 2) * sizeof(T) * k));
   }
-  __device__ void add(T val, IdxT idx) { queue_.add(val, idx); }
+  _RAFT_DEVICE void add(T val, IdxT idx) { queue_.add(val, idx); }
 /**
  * At the point of calling this function, the warp-level queues consumed all input
@@ -560,22 +687,26 @@
  *
  * Here we tree-merge the results using the shared memory and block sync.
  */
-  __device__ void done()
+  _RAFT_DEVICE void done(uint8_t* smem_buf)
   {
     queue_.done();
+    int nwarps    = subwarp_align::div(blockDim.x);
+    auto val_smem = reinterpret_cast(smem_buf);
+    auto idx_smem = reinterpret_cast(
+      smem_buf + Pow2<256>::roundUp(ceildiv(nwarps, 2) * sizeof(T) * queue_.k));
+
     const int warp_id = subwarp_align::div(threadIdx.x);
     // NB: there is no need for the second __synchthreads between .load_sorted and .store:
     // we shift the pointers every iteration, such that individual warps either access the same
     // locations or do not overlap with any of the other warps. The access patterns within warps
     // are different for the two functions, but .load_sorted implies warp sync at the end, so
     // there is no need for __syncwarp either.
-    for (int shift_mask = ~0, nwarps = subwarp_align::div(blockDim.x), split = (nwarps + 1) >> 1;
-         nwarps > 1;
+    for (int shift_mask = ~0, split = (nwarps + 1) >> 1; nwarps > 1;
          nwarps = split, split = (nwarps + 1) >> 1) {
       if (warp_id < nwarps && warp_id >= split) {
         int dst_warp_shift = (warp_id - (split & shift_mask)) * queue_.k;
-        queue_.store(val_smem_ + dst_warp_shift, idx_smem_ + dst_warp_shift);
+        queue_.store(val_smem + dst_warp_shift, idx_smem + dst_warp_shift);
       }
       __syncthreads();
@@ -585,22 +716,27 @@
       // The last argument serves as a condition for loading
       // -- to make sure threads within a full warp do not diverge on `bitonic::merge()`
       queue_.load_sorted(
-        val_smem_ + src_warp_shift, idx_smem_ + src_warp_shift, warp_id < nwarps - split);
+        val_smem + src_warp_shift, idx_smem + src_warp_shift, warp_id < nwarps - split);
     }
   }
 /** Save the content by the pointer location. */
-  __device__ void store(T* out, IdxT* out_idx) const
+  template
+  _RAFT_DEVICE void store(OutT* out,
+                          OutIdxT* out_idx,
+                          ValF valF = raft::identity_op{},
+                          IdxF idxF = raft::identity_op{}) const
   {
-    if (threadIdx.x < subwarp_align::Value) { queue_.store(out, out_idx); }
+    if (threadIdx.x < subwarp_align::Value) { queue_.store(out, out_idx, valF, idxF); }
   }
 private:
   using subwarp_align = Pow2;
   queue_t queue_;
-  T* val_smem_;
-  IdxT* idx_smem_;
 };
 /**
@@ -618,7 +754,10 @@ __launch_bounds__(256) __global__
   void block_kernel(const T* in, const IdxT* in_idx, IdxT len, int k, T* out, IdxT* out_idx)
 {
   extern __shared__ __align__(256) uint8_t smem_buf_bytes[];
-  block_sort queue(k, smem_buf_bytes);
+  using bq_t         = block_sort;
+  uint8_t* warp_smem = bq_t::queue_t::mem_required(blockDim.x) > 0 ? smem_buf_bytes : nullptr;
+  bq_t queue(k, warp_smem);
+
   in += blockIdx.y * len;
   if (in_idx != nullptr) { in_idx += blockIdx.y * len; }
@@ -629,7 +768,7 @@
               (i < len && in_idx != nullptr) ? __ldcs(in_idx + i) : i);
   }
-  queue.done();
+  queue.done(smem_buf_bytes);
   const int block_id = blockIdx.x + gridDim.x * blockIdx.y;
   queue.store(out + block_id * k, out_idx + block_id * k);
 }
@@ -656,7 +795,7 @@ struct launch_setup {
     int* min_grid_size,
     int block_size_limit = 0)
   {
-    const int capacity = calc_capacity(k);
+    const int capacity = bound_by_power_of_two(k);
     if constexpr (Capacity > 1) {
       if (capacity < Capacity) {
         return launch_setup::calc_optimal_params(
@@ -689,7 +828,7 @@ struct launch_setup {
     IdxT* out_idx,
     rmm::cuda_stream_view stream)
   {
-    const int capacity = calc_capacity(k);
+    const int capacity = bound_by_power_of_two(k);
     if constexpr (Capacity > 1) {
       if (capacity < Capacity) {
         return launch_setup::kernel(k,
@@ -740,6 +879,18 @@ struct LaunchThreshold {
   static constexpr int len_factor_for_single_block = 32;
 };
+template <>
+struct LaunchThreshold {
+  static constexpr int len_factor_for_multi_block  = 2;
+  static constexpr int len_factor_for_single_block = 32;
+};
+
+template <>
+struct LaunchThreshold {
+  static constexpr int len_factor_for_multi_block  = 2;
+  static constexpr int len_factor_for_single_block = 32;
+};
+
 template <>
 struct LaunchThreshold {
   static constexpr int len_factor_for_choosing     = 4;
@@ -751,7 +902,7 @@ template
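Since one shared-memory buffer now serves both phases of block_kernel (queue state during add(), tree-merge scratch during done()), a launch has to size it for the larger of the two. A sketch of that computation, mirroring the offsets done() computes; calc_merge_smem is a hypothetical helper written for this example and assumes the queue width equals a full 32-lane warp:

template <typename T, typename IdxT>
size_t calc_merge_smem(int block_dim, int k)
{
  int nwarps  = block_dim / 32;
  // values first (rounded up to the 256-byte alignment of the buffer), then indices
  size_t vals = raft::Pow2<256>::roundUp(raft::ceildiv(nwarps, 2) * sizeof(T) * k);
  return vals + raft::ceildiv(nwarps, 2) * sizeof(IdxT) * k;
}

// at launch (sketch):
//   size_t smem = std::max(queue_t::mem_required(block_dim),
//                          calc_merge_smem<T, IdxT>(block_dim, k));
//   block_kernel<<<grid, block_dim, smem, stream>>>(in, in_idx, len, k, out, out_idx);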